From 0d0748ae6896c7ab235dffb2a0c8a49e16fad7f8 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Thu, 3 Oct 2019 05:44:38 +0000 Subject: Import wiredtiger: e0041ca53c1c1a4a23cc7aaa7ef8137dc1c61117 from branch mongodb-4.4 ref: 0cd668bf3a..e0041ca53c for: 4.3.1 WT-4702 Switch to ubuntu1804-test Evergreen distro WT-4715 Workloads will stall if old transaction or timestamp pinned by thread co-opted for eviction WT-4961 Checkpoints with cache overflow must keep history for reads WT-5093 Enable million-collection-test working with Evergreen distro rhel80-build WT-5094 Enable Windows compile task working with Evergreen distro windows-64-vs2017-test WT-5122 Shut down the sweep server before doing the final checkpoint WT-5128 Add script to run wtperf with XRay profiling WT-5130 Enable Big-endian (s390x/zSeries) working with Evergreen distro ubuntu1804-zseries-build WT-5135 Change lookaside file inserts to use cursor.insert WT-5140 Fix where a cursor returning random items can use an uninitialized buffer WT-5143 Fix typo in error message --- src/third_party/wiredtiger/SConstruct | 1 + .../wiredtiger/bench/wtperf/runners/wtperf_xray.sh | 122 ++++++++++++++ src/third_party/wiredtiger/bench/wtperf/wtperf.c | 3 +- src/third_party/wiredtiger/dist/api_data.py | 25 ++- src/third_party/wiredtiger/dist/filelist | 1 - src/third_party/wiredtiger/dist/s_define.list | 1 - src/third_party/wiredtiger/examples/c/ex_all.c | 2 + src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_random.c | 12 +- src/third_party/wiredtiger/src/btree/bt_read.c | 21 +-- src/third_party/wiredtiger/src/btree/bt_sync.c | 14 +- src/third_party/wiredtiger/src/btree/row_key.c | 12 +- src/third_party/wiredtiger/src/cache/cache_las.c | 51 +++--- .../src/checksum/zseries/slicing-consts.h | 6 + src/third_party/wiredtiger/src/config/config_def.c | 97 ++++++----- src/third_party/wiredtiger/src/conn/conn_api.c | 49 ++---- src/third_party/wiredtiger/src/conn/conn_open.c | 8 +- 
src/third_party/wiredtiger/src/evict/evict_lru.c | 36 ++-- src/third_party/wiredtiger/src/evict/evict_page.c | 11 +- src/third_party/wiredtiger/src/include/api.h | 4 +- src/third_party/wiredtiger/src/include/btmem.h | 38 +++-- src/third_party/wiredtiger/src/include/btree.i | 6 +- .../wiredtiger/src/include/connection.h | 2 + src/third_party/wiredtiger/src/include/extern.h | 26 +-- .../wiredtiger/src/include/extern_posix.h | 3 +- src/third_party/wiredtiger/src/include/misc.i | 43 ----- src/third_party/wiredtiger/src/include/reconcile.h | 9 +- src/third_party/wiredtiger/src/include/session.h | 7 +- src/third_party/wiredtiger/src/include/time.i | 182 +++++++++++++++++++++ src/third_party/wiredtiger/src/include/txn.i | 55 +------ .../wiredtiger/src/include/wiredtiger.in | 49 ++++-- .../wiredtiger/src/include/wt_internal.h | 1 + src/third_party/wiredtiger/src/os_posix/os_time.c | 1 + .../wiredtiger/src/reconcile/rec_child.c | 5 +- .../wiredtiger/src/reconcile/rec_visibility.c | 136 ++++++--------- .../wiredtiger/src/reconcile/rec_write.c | 36 +--- src/third_party/wiredtiger/src/support/time.c | 109 ------------ src/third_party/wiredtiger/src/txn/txn.c | 151 ++++++++++++++++- src/third_party/wiredtiger/src/txn/txn_ckpt.c | 27 ++- src/third_party/wiredtiger/src/txn/txn_recover.c | 2 +- .../wiredtiger/src/txn/txn_rollback_to_stable.c | 29 ++-- src/third_party/wiredtiger/test/evergreen.yml | 30 ++-- src/third_party/wiredtiger/test/format/Makefile.am | 2 +- src/third_party/wiredtiger/test/format/config.h | 3 + src/third_party/wiredtiger/test/format/format.h | 2 + src/third_party/wiredtiger/test/format/ops.c | 7 +- src/third_party/wiredtiger/test/format/random.c | 95 +++++++++++ src/third_party/wiredtiger/test/format/wts.c | 3 +- .../wiredtiger/test/suite/test_debug_mode05.py | 2 + .../wiredtiger/test/suite/test_las01.py | 9 +- .../wiredtiger/test/suite/test_timestamp04.py | 7 +- .../wiredtiger/test/suite/test_timestamp06.py | 26 ++- 
.../wiredtiger/test/suite/test_timestamp11.py | 1 + .../wiredtiger/test/suite/test_timestamp16.py | 14 +- .../wiredtiger/test/suite/test_txn21.py | 49 ++++++ 55 files changed, 1011 insertions(+), 634 deletions(-) create mode 100644 src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh create mode 100644 src/third_party/wiredtiger/src/include/time.i delete mode 100644 src/third_party/wiredtiger/src/support/time.c create mode 100644 src/third_party/wiredtiger/test/format/random.c create mode 100644 src/third_party/wiredtiger/test/suite/test_txn21.py (limited to 'src') diff --git a/src/third_party/wiredtiger/SConstruct b/src/third_party/wiredtiger/SConstruct index f895a53c426..f4d0d31dab0 100644 --- a/src/third_party/wiredtiger/SConstruct +++ b/src/third_party/wiredtiger/SConstruct @@ -496,6 +496,7 @@ t = env.Program("t_format", "test/format/lrt.c", "test/format/ops.c", "test/format/rebalance.c", + "test/format/random.c", "test/format/salvage.c", "test/format/snap.c", "test/format/t.c", diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh new file mode 100644 index 00000000000..398c6a9bcf5 --- /dev/null +++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# wtperf_xray.sh - run wtperf regression tests with xray profiling and generate +# profiling information. +# +# This script assumes it is running in the directory with the wtperf executable. +# +# Usage +# wtperf_xray.sh [-h output-directory] [wtperf other args] +# +# This script checks the first argument after the wtperf configuration to see +# whether a home directory is being specified with the -h flag. If so, this +# script will write its output files to that directory. Otherwise it will +# default to WT_TEST (wtperf's default). +# +# Environment variables +# XRAY_BINARY -- +# The binary to use to inspect the xray log. 
(default: llvm-xray) +# FLAME_GRAPH_PATH -- +# The path to your copy of Brendan Gregg's FlameGraph script. (optional) +# +# When this is complete you can find information in the following files: +# wtperf_account.txt -- +# The top 10 functions where the workload is spending the most time along +# with a count, min, max and some percentiles for each one. +# wtperf_stack.txt -- +# The top 10 stack traces where the workload is spending the most time. +# This calculation is done separately per thread. +# wtperf_graph.svg -- +# A function call graph showing what functions call each other. The edges +# are labelled and coloured proportionally to represent the ratio of time +# spent in each function call. +# wtperf_flame.svg -- +# A graph visualising stack traces and the time spent within each stack +# frame. If FLAME_GRAPH_PATH is not specified, this graph won't be +# generated. +# +if ! test -f ./wtperf; then + echo "$0: could not find wtperf in current working directory" + exit 1 +fi + +if test "$#" -lt "1"; then + echo "$0: must specify wtperf configuration to run" + exit 1 +fi + +# By default, wtperf uses WT_TEST as its home directory. +xray_home="WT_TEST" +if test "$2" = "-h"; then + if ! test -z "$3"; then + xray_home="$3" + fi +fi +echo "$0: using $xray_home as home directory" + +# Check symbols to ensure we've compiled with XRay. +objdump_out=$(objdump -h -j xray_instr_map ./wtperf) +if test -z "$objdump_out"; then + echo "$0: wtperf not compiled with xray, add '-fxray-instrument' to your CFLAGS" + exit 1 +fi + +if ! 
test -d "$xray_home"; then + echo "$0: creating directory $xray_home" + mkdir "$xray_home" +fi + +xray_account_path="${xray_home}/wtperf_account.txt" +xray_stack_path="${xray_home}/wtperf_stack.txt" +xray_graph_path="${xray_home}/wtperf_graph.svg" +xray_flame_path="${xray_home}/wtperf_flame.svg" + +rm xray-log.wtperf.* \ + "$xray_account_path" \ + "$xray_stack_path" \ + "$xray_graph_path" \ + "$xray_flame_path" + +export XRAY_OPTIONS="patch_premain=true xray_mode=xray-basic verbosity=1" +./wtperf -O "$@" + +xray_log=$(ls xray-log.wtperf.*) +num_logs=$(echo "$xray_log" | wc -w) +if test "$num_logs" -ne "1"; then + echo "$0: detected more than one xray log" + exit 1 +fi + +if test -z "$XRAY_BINARY"; then + xray_bin="llvm-xray" + echo "$0: XRAY_BINARY is unset, defaulting to $xray_bin" +else + xray_bin="$XRAY_BINARY" +fi + +$xray_bin account "$xray_log" \ + -top=10 -sort=sum -sortorder=dsc -instr_map ./wtperf > \ + "$xray_account_path" + +# Use the -per-thread-stacks option to get the top 10 stacks for each thread. +# We could use the -aggregate-threads flag here to get the top stacks for all threads (omitting duplicates). +$xray_bin stack -per-thread-stacks "$xray_log" \ + -instr_map ./wtperf > \ + "$xray_stack_path" + +# Generate a DOT graph. +$xray_bin graph "$xray_log" \ + -m ./wtperf -color-edges=sum -edge-label=sum | \ + unflatten -f -l10 | \ + dot -Tsvg -o "$xray_graph_path" + +# This file can be inspected in the Google Chrome Trace Viewer. +# It seems to take a long time to generate this so just disable it for now. 
+# $xray_bin convert -symbolize -instr_map=./wtperf -output-format=trace_event $xray_log | gzip > wtperf_trace.txt.gz +if test -z "$FLAME_GRAPH_PATH"; then + echo "$0: FLAME_GRAPH_PATH is unset, skipping flame graph generation" +else + $xray_bin stack "$xray_log" \ + -instr_map ./wtperf -stack-format=flame -aggregation-type=time -all-stacks | \ + "$FLAME_GRAPH_PATH" > "$xray_flame_path" +fi diff --git a/src/third_party/wiredtiger/bench/wtperf/wtperf.c b/src/third_party/wiredtiger/bench/wtperf/wtperf.c index cf12df3f2fc..b659d83cbc7 100644 --- a/src/third_party/wiredtiger/bench/wtperf/wtperf.c +++ b/src/third_party/wiredtiger/bench/wtperf/wtperf.c @@ -654,8 +654,7 @@ worker(void *arg) */ measure_latency = opts->sample_interval != 0 && trk != NULL && trk->ops != 0 && (trk->ops % opts->sample_rate == 0); - if (measure_latency) - __wt_epoch(NULL, &start); + __wt_epoch(NULL, &start); /* [-Werror=maybe-uninitialized] */ cursor->set_key(cursor, key_buf); switch (*op) { diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 3bd75b7187c..6d9d4f1db3d 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -605,6 +605,13 @@ connection_runtime_config = [ Config('lsm_merge', 'true', r''' merge LSM chunks where possible (deprecated)''', type='boolean', undoc=True), + Config('operation_timeout_ms', '0', r''' + when non-zero, a requested limit on the number of elapsed real time milliseconds + application threads will take to complete database operations. Time is measured from the + start of each WiredTiger API call. There is no guarantee any operation will not take + longer than this amount of time. If WiredTiger notices the limit has been exceeded, an + operation may return a WT_ROLLBACK error. Default is to have no limit''', + min=1), Config('operation_tracking', '', r''' enable tracking of performance-critical functions. 
See @ref operation_tracking for more information''', @@ -1333,6 +1340,13 @@ methods = { choices=['read-uncommitted', 'read-committed', 'snapshot']), Config('name', '', r''' name of the transaction for tracing and debugging'''), + Config('operation_timeout_ms', '0', r''' + when non-zero, a requested limit on the number of elapsed real time milliseconds taken + to complete database operations in this transaction. Time is measured from the start + of each WiredTiger API call. There is no guarantee any operation will not take longer + than this amount of time. If WiredTiger notices the limit has been exceeded, an operation + may return a WT_ROLLBACK error. Default is to have no limit''', + min=1), Config('priority', 0, r''' priority of the transaction for resolving conflicts. Transactions with higher values are less likely to abort''', @@ -1436,8 +1450,8 @@ methods = { dropped while a hot backup is in progress or if open in a cursor''', type='list'), Config('force', 'false', r''' - by default, checkpoints may be skipped if the underlying object - has not been modified, this option forces the checkpoint''', + if false (the default), checkpoints may be skipped if the underlying object has not been + modified, if true, this option forces the checkpoint''', type='boolean'), Config('name', '', r''' if set, specify a name for the checkpoint (note that checkpoints @@ -1445,10 +1459,9 @@ methods = { Config('target', '', r''' if non-empty, checkpoint the list of objects''', type='list'), Config('use_timestamp', 'true', r''' - by default, create the checkpoint as of the last stable timestamp - if timestamps are in use, or all current updates if there is no - stable timestamp set. If false, this option generates a checkpoint - with all updates including those later than the timestamp''', + if true (the default), create the checkpoint as of the last stable timestamp if timestamps + are in use, or all current updates if there is no stable timestamp set. 
If false, this + option generates a checkpoint with all updates including those later than the timestamp''', type='boolean'), ]), diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index 1d398a4aa88..9e7eb0b23ac 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -205,7 +205,6 @@ src/support/rand.c src/support/scratch.c src/support/stat.c src/support/thread_group.c -src/support/time.c src/txn/txn.c src/txn/txn_ckpt.c src/txn/txn_ext.c diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index f5e3584343d..85a240550ea 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -75,7 +75,6 @@ WT_TIMEDIFF_US WT_TRACK_OP WT_TRACK_OP_END WT_TRACK_OP_INIT -WT_TRACK_TIME WT_TRET_ERROR_OK WT_UPDATE_SIZE WT_USE_OPENAT diff --git a/src/third_party/wiredtiger/examples/c/ex_all.c b/src/third_party/wiredtiger/examples/c/ex_all.c index b9792fbc82b..ffc3c469881 100644 --- a/src/third_party/wiredtiger/examples/c/ex_all.c +++ b/src/third_party/wiredtiger/examples/c/ex_all.c @@ -928,6 +928,8 @@ transaction_ops(WT_SESSION *session_arg) error_check(conn->set_timestamp(conn, "stable_timestamp=2a")); /*! [set stable timestamp] */ + /* WT_CONNECTION.rollback_to_stable requires a timestamped checkpoint. */ + error_check(session->checkpoint(session, NULL)); /*! [rollback to stable] */ error_check(conn->rollback_to_stable(conn, NULL)); /*! 
[rollback to stable] */ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 871a3a0366c..427cb8cd696 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "0cd668bf3ac3cdd5840d84d70205dabbb727278c", + "commit": "e0041ca53c1c1a4a23cc7aaa7ef8137dc1c61117", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.4" diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index ae2c64a126d..f3f8b31b33e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -314,8 +314,8 @@ __random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) for (i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE;;) { ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false); if (ret == WT_NOTFOUND) { - next = false; /* Reverse direction from the end of the tree. */ - ret = __wt_btcur_prev(cbt, false); + next = !next; /* Reverse direction. */ + ret = next ? __wt_btcur_next(cbt, false) : __wt_btcur_prev(cbt, false); WT_RET(ret); /* An empty tree. */ } if (i > 0) @@ -324,8 +324,14 @@ __random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * Skip the record we returned last time, once. Clear the tracking value so we don't * skip that record twice, it just means the tree is too small for anything reasonable. + * + * Testing WT_DATA_IN_ITEM requires explanation: the cursor temporary buffer is used to + * build keys for row-store searches and can point into the row-store page (which might + * have been freed subsequently). If a previous random call set the temporary buffer, + * then it will be local data. If it's local data for some other reason than a previous + * random call, we don't care: it won't match, and if it does we just retry. 
*/ - if (cursor->key.size == cbt->tmp->size && + if (WT_DATA_IN_ITEM(cbt->tmp) && cursor->key.size == cbt->tmp->size && memcmp(cursor->key.data, cbt->tmp->data, cbt->tmp->size) == 0) { cbt->tmp->size = 0; i = __wt_random(&session->rnd) % WT_RANDOM_CURSOR_MOVE; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index b21221439f6..176ade40575 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -116,6 +116,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_DECL_RET; WT_ITEM las_key, las_value; WT_PAGE *page; + WT_PAGE_LOOKASIDE *page_las; WT_UPDATE *first_upd, *last_upd, *upd; wt_timestamp_t durable_timestamp, las_timestamp; size_t incr, total_incr; @@ -131,7 +132,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) locked = false; total_incr = 0; current_recno = recno = WT_RECNO_OOB; - las_pageid = ref->page_las->las_pageid; + page_las = ref->page_las; + las_pageid = page_las->las_pageid; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_CLEAR(las_key); @@ -167,7 +169,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * Confirm the search using the unique prefix; if not a match, we're done searching for * records for this page. */ - if (las_pageid != ref->page_las->las_pageid) + if (las_pageid != page_las->las_pageid) break; /* Allocate the WT_UPDATE structure. 
*/ @@ -265,12 +267,11 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) FLD_SET(page->modify->restore_state, WT_PAGE_RS_LOOKASIDE); - if (ref->page_las->skew_newest && !ref->page_las->has_prepares && + if (page_las->min_skipped_ts == WT_TS_MAX && !page_las->has_prepares && !S2C(session)->txn_global.has_stable_timestamp && - __wt_txn_visible_all( - session, ref->page_las->unstable_txn, ref->page_las->unstable_durable_timestamp)) { - page->modify->rec_max_txn = ref->page_las->max_txn; - page->modify->rec_max_timestamp = ref->page_las->max_timestamp; + __wt_txn_visible_all(session, page_las->max_txn, page_las->max_ondisk_ts)) { + page->modify->rec_max_txn = page_las->max_txn; + page->modify->rec_max_timestamp = page_las->max_ondisk_ts; __wt_page_modify_clear(session, page); } } @@ -279,8 +280,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * Now the lookaside history has been read into cache there is no further need to maintain a * reference to it. */ - ref->page_las->eviction_to_lookaside = false; - ref->page_las->resolved = true; + page_las->eviction_to_lookaside = false; + page_las->resolved = true; err: if (locked) @@ -543,7 +544,7 @@ skip_read: * Don't free WT_REF.page_las, there may be concurrent readers. 
*/ if (final_state == WT_REF_MEM && ref->page_las != NULL && - (!ref->page_las->skew_newest || ref->page_las->has_prepares)) + (ref->page_las->min_skipped_ts != WT_TS_MAX || ref->page_las->has_prepares)) WT_ERR(__wt_las_remove_block(session, ref->page_las->las_pageid)); WT_REF_SET_STATE(ref, final_state); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 3fdaf9c240e..a988793e6e7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -299,21 +299,9 @@ __wt_sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * cache clean but with history that cannot be * discarded), that is not wasted effort because * checkpoint doesn't need to write the page again. - * - * Once the transaction has given up it's snapshot it - * is no longer safe to reconcile pages. That happens - * prior to the final metadata checkpoint. - * - * XXX Only attempt this eviction when there are no - * readers older than the checkpoint. Otherwise, a bug - * in eviction can mark the page clean and discard - * history, causing those reads to incorrectly see - * newer versions of data than they should. */ if (!WT_PAGE_IS_INTERNAL(page) && page->read_gen == WT_READGEN_WONT_NEED && - !tried_eviction && F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT) && - (!F_ISSET(txn, WT_TXN_HAS_TS_READ) || - txn->read_timestamp == conn->txn_global.pinned_timestamp)) { + !tried_eviction) { WT_ERR_BUSY_OK(__wt_page_release_evict(session, walk, 0)); walk = prev; prev = NULL; diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index c017e7c8a9c..d278a5d0496 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -180,17 +180,15 @@ __wt_row_leaf_key_work( copy = WT_ROW_KEY_COPY(rip); #ifdef HAVE_DIAGNOSTIC /* - * Debugging added to detect and gather information for rare hang. 
Detect and abort if the - * current operation takes too long. + * Debugging added to detect and gather information for rare hang, WT-5043. Detect and abort + * if the current function call or operation takes too long (and 5 minutes is an eternity). */ __wt_seconds32(session, &current); WT_ERR_ASSERT(session, (current - start) < WT_MINUTE * 5, EINVAL, - "Current function call taking too long: current %" PRIu32 " func started %" PRIu32, - current, start); + "call tracking for WT-5043: %s took longer than 5 minutes", __func__); WT_ERR_ASSERT(session, - session->op_start == 0 || ((current - session->op_start) < WT_MINUTE * 5), EINVAL, - "Operation taking too long: current %" PRIu32 " started %" PRIu32, current, - session->op_start); + (session->op_5043_seconds == 0 || (current - session->op_5043_seconds) < WT_MINUTE * 5), + EINVAL, "operation tracking for WT-5043: %s took longer than 5 minutes", session->name); #endif /* diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index e1edcb596fa..aa05724b406 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -396,7 +396,6 @@ bool __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) { WT_TXN *txn; - wt_timestamp_t unstable_timestamp; txn = &session->txn; @@ -425,13 +424,17 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) /* * If some of the page's history overlaps with the reader's snapshot then we have to read it. - * This is only relevant if we chose versions that were unstable when the page was written. */ - if (ref->page_las->skew_newest && WT_TXNID_LE(txn->snap_min, ref->page_las->unstable_txn)) + if (WT_TXNID_LE(txn->snap_min, ref->page_las->max_txn)) return (false); + /* + * Otherwise, if not reading at a timestamp, the page's history is in the past, so the page + * image is correct if it contains the most recent versions of everything and nothing was + * prepared. 
+ */ if (!F_ISSET(txn, WT_TXN_HAS_TS_READ)) - return (ref->page_las->skew_newest); + return (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX); /* * Skip lookaside history if reading as of a timestamp, we evicted new @@ -439,21 +442,18 @@ __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) * possible for prepared updates, because the commit timestamp was not * known when the page was evicted. * - * Skip lookaside pages if reading as of a timestamp, we evicted old - * versions of data and all the unstable updates are in the future. - * - * Checkpoint should respect durable timestamps, other reads should - * respect ordinary visibility. Checking for just the unstable updates - * during checkpoint would end up reading more content from lookaside - * than necessary. + * Otherwise, skip reading lookaside history if everything on the page + * is older than the read timestamp, and the oldest update in lookaside + * newer than the page is in the future of the reader. This seems + * unlikely, but is exactly what eviction tries to do when a checkpoint + * is running. */ - unstable_timestamp = WT_SESSION_IS_CHECKPOINT(session) ? - ref->page_las->unstable_durable_timestamp : - ref->page_las->unstable_timestamp; - if (ref->page_las->skew_newest && !ref->page_las->has_prepares && - txn->read_timestamp > unstable_timestamp) + if (!ref->page_las->has_prepares && ref->page_las->min_skipped_ts == WT_TS_MAX && + txn->read_timestamp >= ref->page_las->max_ondisk_ts) return (true); - if (!ref->page_las->skew_newest && txn->read_timestamp < unstable_timestamp) + + if (txn->read_timestamp >= ref->page_las->max_ondisk_ts && + txn->read_timestamp < ref->page_las->min_skipped_ts) return (true); return (false); @@ -586,16 +586,15 @@ __las_insert_block_verbose(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_MULTI * "file ID %" PRIu32 ", page ID %" PRIu64 ". " "Max txn ID %" PRIu64 - ", unstable timestamp %s," - " unstable durable timestamp %s, %s. 
" + ", max ondisk timestamp %s, " + "first skipped ts %s. " "Entries now in lookaside file: %" PRId64 ", " "cache dirty: %2.3f%% , " "cache use: %2.3f%%", btree_id, multi->page_las.las_pageid, multi->page_las.max_txn, - __wt_timestamp_to_string(multi->page_las.unstable_timestamp, ts_string[0]), - __wt_timestamp_to_string(multi->page_las.unstable_durable_timestamp, ts_string[1]), - multi->page_las.skew_newest ? "newest" : "not newest", + __wt_timestamp_to_string(multi->page_las.max_ondisk_ts, ts_string[0]), + __wt_timestamp_to_string(multi->page_las.min_skipped_ts, ts_string[1]), WT_STAT_READ(conn->stats, cache_lookaside_entries), pct_dirty, pct_full); } @@ -746,18 +745,14 @@ __wt_las_insert_block( if (upd == list->onpage_upd && upd->size > 0 && (upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY)) { las_value.size = 0; - WT_ASSERT(session, upd != first_upd || multi->page_las.skew_newest); cursor->set_value(cursor, upd->txnid, upd->start_ts, upd->durable_ts, upd->prepare_state, WT_UPDATE_BIRTHMARK, &las_value); } else cursor->set_value(cursor, upd->txnid, upd->start_ts, upd->durable_ts, upd->prepare_state, upd->type, &las_value); - /* - * Using update looks a little strange because the keys are guaranteed to not exist, but - * since we're appending, we want the cursor to stay positioned in between inserts. - */ - WT_ERR(cursor->update(cursor)); + /* Using insert so we don't keep the page pinned longer than necessary. */ + WT_ERR(cursor->insert(cursor)); ++insert_cnt; if (upd->prepare_state == WT_PREPARE_INPROGRESS) ++prepared_insert_cnt; diff --git a/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h b/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h index dae4b9d1c1e..88ddc900243 100644 --- a/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h +++ b/src/third_party/wiredtiger/src/checksum/zseries/slicing-consts.h @@ -1,4 +1,5 @@ /* CRC-32 and CRC-32C slicing-by-8 constants, for use on big-endian systems. 
*/ +#if 0 static const unsigned int __attribute__((aligned(128))) crc32table_le[8][256] = { {0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07, 0x8ff46a70, 0x35a563e9, 0xa395649e, 0x3288db0e, 0xa4b8dc79, 0x1ee9d5e0, 0x88d9d297, 0x2b4cb609, 0xbd7cb17e, 0x072db8e7, 0x911dbf90, @@ -257,7 +258,9 @@ static const unsigned int __attribute__((aligned(128))) crc32table_le[8][256] = 0x4a146bff, 0xd414c133, 0x37134ebd, 0xa913e471, 0xb01a217b, 0x2e1a8bb7, 0xcd1d0439, 0x531daef5, 0xff0f8e2c, 0x610f24e0, 0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea, 0xe6064b26}}; +#endif /* NOT CURRENTLY USED */ +#if 0 static const unsigned int __attribute__((aligned(128))) crc32table_be[8][256] = { {0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd, @@ -516,6 +519,7 @@ static const unsigned int __attribute__((aligned(128))) crc32table_be[8][256] = 0x3548049b, 0x6ee9d851, 0x820bbd0f, 0xd9aa61c5, 0x5f0e6a04, 0x04afb6ce, 0xe84dd390, 0xb3ec0f5a, 0xe1c4d9a5, 0xba65056f, 0x56876031, 0x0d26bcfb, 0x8b82b73a, 0xd0236bf0, 0x3cc10eae, 0x6760d264}}; +#endif /* NOT CURRENTLY USED */ static const unsigned int __attribute__((aligned(128))) crc32ctable_le[8][256] = { {0x00000000, 0x03836bf2, 0xf7703be1, 0xf4f35013, 0x1f979ac7, 0x1c14f135, 0xe8e7a126, 0xeb64cad4, @@ -776,6 +780,7 @@ static const unsigned int __attribute__((aligned(128))) crc32ctable_le[8][256] = 0xa1354ce5, 0x864870ac, 0xefcf3477, 0xc8b2083e, 0xccb751c4, 0xebca6d8d, 0x824d2956, 0xa530151f}}; +#if 0 static const unsigned int __attribute__((aligned(128))) crc32ctable_be[8][256] = { {0x00000000, 0x1edc6f41, 0x3db8de82, 0x2364b1c3, 0x7b71bd04, 0x65add245, 0x46c96386, 0x58150cc7, 0xf6e37a08, 0xe83f1549, 0xcb5ba48a, 0xd587cbcb, 0x8d92c70c, 0x934ea84d, 0xb02a198e, 0xaef676cf, @@ -1034,3 +1039,4 @@ static const unsigned int __attribute__((aligned(128))) crc32ctable_be[8][256] = 0x7b80461d, 
0x5de9c631, 0x37534645, 0x113ac669, 0xe22646ad, 0xc44fc681, 0xaef546f5, 0x889cc6d9, 0x5610283c, 0x7079a810, 0x1ac32864, 0x3caaa848, 0xcfb6288c, 0xe9dfa8a0, 0x836528d4, 0xa50ca8f8}}; +#endif /* NOT CURRENTLY USED */ diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index e23c4dd4c5e..958c267a7ce 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -118,6 +118,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { {"log", "category", NULL, NULL, confchk_WT_CONNECTION_reconfigure_log_subconfigs, 4}, {"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2}, {"lsm_merge", "boolean", NULL, NULL, NULL, 0}, + {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0}, {"operation_tracking", "category", NULL, NULL, confchk_wiredtiger_open_operation_tracking_subconfigs, 2}, {"shared_cache", "category", NULL, NULL, confchk_wiredtiger_open_shared_cache_subconfigs, 5}, @@ -190,7 +191,8 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_begin_transaction[] = { "choices=[\"read-uncommitted\",\"read-committed\"," "\"snapshot\"]", NULL, 0}, - {"name", "string", NULL, NULL, NULL, 0}, {"priority", "int", NULL, "min=-100,max=100", NULL, 0}, + {"name", "string", NULL, NULL, NULL, 0}, {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0}, + {"priority", "int", NULL, "min=-100,max=100", NULL, 0}, {"read_timestamp", "string", NULL, NULL, NULL, 0}, {"roundup_timestamps", "category", NULL, NULL, confchk_WT_SESSION_begin_transaction_roundup_timestamps_subconfigs, 2}, @@ -551,6 +553,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { {"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2}, {"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0}, + {"operation_timeout_ms", "int", 
NULL, "min=1", NULL, 0}, {"operation_tracking", "category", NULL, NULL, confchk_wiredtiger_open_operation_tracking_subconfigs, 2}, {"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0}, @@ -618,6 +621,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { {"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2}, {"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0}, + {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0}, {"operation_tracking", "category", NULL, NULL, confchk_wiredtiger_open_operation_tracking_subconfigs, 2}, {"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0}, @@ -685,6 +689,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { {"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2}, {"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0}, + {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0}, {"operation_tracking", "category", NULL, NULL, confchk_wiredtiger_open_operation_tracking_subconfigs, 2}, {"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0}, @@ -750,6 +755,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { {"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2}, {"lsm_merge", "boolean", NULL, NULL, NULL, 0}, {"mmap", "boolean", NULL, NULL, NULL, 0}, {"multiprocess", "boolean", NULL, NULL, NULL, 0}, + {"operation_timeout_ms", "int", NULL, "min=1", NULL, 0}, {"operation_tracking", "category", NULL, NULL, confchk_wiredtiger_open_operation_tracking_subconfigs, 2}, {"readonly", "boolean", NULL, NULL, NULL, 0}, {"salvage", "boolean", NULL, NULL, NULL, 0}, @@ -815,12 +821,12 @@ static const WT_CONFIG_ENTRY 
config_entries[] = {{"WT_CONNECTION.add_collator", "close_scan_interval=10),io_capacity=(total=0),log=(archive=true," "os_cache_dirty_pct=0,prealloc=true,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "operation_tracking=(enabled=false,path=\".\")," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - "statistics=none,statistics_log=(json=false,on_close=false," - "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "operation_timeout_ms=0,operation_tracking=(enabled=false," + "path=\".\"),shared_cache=(chunk=10MB,name=,quota=0,reserve=0," + "size=500MB),statistics=none,statistics_log=(json=false," + "on_close=false,sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "timing_stress_for_test=,verbose=", - confchk_WT_CONNECTION_reconfigure, 26}, + confchk_WT_CONNECTION_reconfigure, 27}, {"WT_CONNECTION.rollback_to_stable", "", NULL, 0}, {"WT_CONNECTION.set_file_system", "", NULL, 0}, {"WT_CONNECTION.set_timestamp", "commit_timestamp=,durable_timestamp=,force=false," @@ -836,9 +842,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", ",os_cache_max=0", confchk_WT_SESSION_alter, 8}, {"WT_SESSION.begin_transaction", - "ignore_prepare=false,isolation=,name=,priority=0,read_timestamp=" - ",roundup_timestamps=(prepared=false,read=false),snapshot=,sync=", - confchk_WT_SESSION_begin_transaction, 8}, + "ignore_prepare=false,isolation=,name=,operation_timeout_ms=0," + "priority=0,read_timestamp=,roundup_timestamps=(prepared=false," + "read=false),snapshot=,sync=", + confchk_WT_SESSION_begin_transaction, 9}, {"WT_SESSION.checkpoint", "drop=,force=false,name=,target=,use_timestamp=true", confchk_WT_SESSION_checkpoint, 5}, {"WT_SESSION.close", "", NULL, 0}, @@ -989,16 +996,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," "prealloc=true,recover=on,zero_fill=false)," 
"lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,salvage=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - "statistics=none,statistics_log=(json=false,on_close=false," - "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - "timing_stress_for_test=,transaction_sync=(enabled=false," - "method=fsync),use_environment=true,use_environment_priv=false," + "mmap=true,multiprocess=false,operation_timeout_ms=0," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "salvage=false,session_max=100,session_scratch_max=2MB," + "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" + ",method=fsync),use_environment=true,use_environment_priv=false," "verbose=,write_through=", - confchk_wiredtiger_open, 50}, + confchk_wiredtiger_open, 51}, {"wiredtiger_open_all", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," @@ -1019,16 +1026,16 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," "prealloc=true,recover=on,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,salvage=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - "statistics=none,statistics_log=(json=false,on_close=false," - "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - 
"timing_stress_for_test=,transaction_sync=(enabled=false," - "method=fsync),use_environment=true,use_environment_priv=false," + "mmap=true,multiprocess=false,operation_timeout_ms=0," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "salvage=false,session_max=100,session_scratch_max=2MB," + "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" + ",method=fsync),use_environment=true,use_environment_priv=false," "verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_all, 51}, + confchk_wiredtiger_open_all, 52}, {"wiredtiger_open_basecfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," @@ -1047,15 +1054,15 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," "prealloc=true,recover=on,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,salvage=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - "statistics=none,statistics_log=(json=false,on_close=false," - "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - "timing_stress_for_test=,transaction_sync=(enabled=false," - "method=fsync),verbose=,version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 45}, + "mmap=true,multiprocess=false,operation_timeout_ms=0," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "salvage=false,session_max=100,session_scratch_max=2MB," + "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," + 
"reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" + ",method=fsync),verbose=,version=(major=0,minor=0),write_through=", + confchk_wiredtiger_open_basecfg, 46}, {"wiredtiger_open_usercfg", "async=(enabled=false,ops_max=1024,threads=2),buffer_alignment=-1" ",builtin_extension_config=,cache_cursors=true," @@ -1074,15 +1081,15 @@ static const WT_CONFIG_ENTRY config_entries[] = {{"WT_CONNECTION.add_collator", "enabled=false,file_max=100MB,os_cache_dirty_pct=0,path=\".\"," "prealloc=true,recover=on,zero_fill=false)," "lsm_manager=(merge=true,worker_thread_max=4),lsm_merge=true," - "mmap=true,multiprocess=false,operation_tracking=(enabled=false," - "path=\".\"),readonly=false,salvage=false,session_max=100," - "session_scratch_max=2MB,session_table_cache=true," - "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - "statistics=none,statistics_log=(json=false,on_close=false," - "path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," - "timing_stress_for_test=,transaction_sync=(enabled=false," - "method=fsync),verbose=,write_through=", - confchk_wiredtiger_open_usercfg, 44}, + "mmap=true,multiprocess=false,operation_timeout_ms=0," + "operation_tracking=(enabled=false,path=\".\"),readonly=false," + "salvage=false,session_max=100,session_scratch_max=2MB," + "session_table_cache=true,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(json=false" + ",on_close=false,path=\".\",sources=,timestamp=\"%b %d %H:%M:%S\"" + ",wait=0),timing_stress_for_test=,transaction_sync=(enabled=false" + ",method=fsync),verbose=,write_through=", + confchk_wiredtiger_open_usercfg, 45}, {NULL, NULL, NULL, 0}}; int diff --git a/src/third_party/wiredtiger/src/conn/conn_api.c b/src/third_party/wiredtiger/src/conn/conn_api.c index c7e776c62c4..9e7964758ff 100644 --- 
a/src/third_party/wiredtiger/src/conn/conn_api.c +++ b/src/third_party/wiredtiger/src/conn/conn_api.c @@ -1010,7 +1010,6 @@ __conn_close(WT_CONNECTION *wt_conn, const char *config) WT_SESSION *wt_session; WT_SESSION_IMPL *s, *session; uint32_t i; - const char *ckpt_cfg; conn = (WT_CONNECTION_IMPL *)wt_conn; @@ -1074,47 +1073,24 @@ err: WT_TRET(__wt_lsm_manager_destroy(session)); /* - * After the async and LSM threads have exited, we shouldn't opening any more files. + * After the async and LSM threads have exited, we won't open more files for the application. + * However, the sweep server is still running and it can close file handles at the same time the + * final checkpoint is reviewing open data handles (forcing checkpoint to reopen handles). Shut + * down the sweep server and then flag the system should not open anything new. */ + WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); WT_FULL_BARRIER(); - /* The default session is used to access data handles during close. */ - F_CLR(session, WT_SESSION_NO_DATA_HANDLES); - /* - * Perform a system-wide checkpoint so that all tables are consistent with each other. All - * transactions are resolved but ignore timestamps to make sure all data gets to disk. Do this - * before shutting down all the subsystems. We have shut down all user sessions, but send in - * true for waiting for internal races. + * Shut down the checkpoint and capacity server threads: we don't want to throttle writes and + * we're about to do a final checkpoint separately from the checkpoint server. 
*/ - WT_TRET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); - ckpt_cfg = "use_timestamp=false"; - if (cval.val != 0) { - ckpt_cfg = "use_timestamp=true"; - if (conn->txn_global.has_stable_timestamp) - F_SET(conn, WT_CONN_CLOSING_TIMESTAMP); - } - if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) { - s = NULL; - WT_TRET(__wt_open_internal_session(conn, "close_ckpt", true, 0, &s)); - if (s != NULL) { - const char *checkpoint_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_checkpoint), ckpt_cfg, NULL}; - wt_session = &s->iface; - WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true)); + WT_TRET(__wt_capacity_server_destroy(session)); + WT_TRET(__wt_checkpoint_server_destroy(session)); - /* - * Mark the metadata dirty so we flush it on close, allowing recovery to be skipped. - */ - WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s), __wt_tree_modify_set(s)); - - WT_TRET(wt_session->close(wt_session, config)); - } - } - - /* Shut down the global transaction state. */ - __wt_txn_global_shutdown(session); + /* Perform a final checkpoint and shut down the global transaction state. 
*/ + WT_TRET(__wt_txn_global_shutdown(session, config, cfg)); if (ret != 0) { __wt_err(session, ret, "failure during close, disabling further writes"); @@ -2574,6 +2550,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val != 0; + WT_ERR(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval)); + conn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND); + WT_ERR(__wt_config_gets(session, cfg, "salvage", &cval)); if (cval.val) { if (F_ISSET(conn, WT_CONN_READONLY)) diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index fc352bbf821..f7e338ac9bb 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -82,10 +82,12 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_SET(conn, WT_CONN_CLOSING); WT_FULL_BARRIER(); + /* The default session is used to access data handles during close. */ + F_CLR(session, WT_SESSION_NO_DATA_HANDLES); + /* - * Shut down server threads other than the eviction server, which is needed later to close btree - * handles. Some of these threads access btree handles, so take care in ordering shutdown to - * make sure they exit before files are closed. + * Shut down server threads. Some of these threads access btree handles and eviction, shut them + * down before the eviction server, and shut all servers down before closing open data handles. 
*/ WT_TRET(__wt_capacity_server_destroy(session)); WT_TRET(__wt_checkpoint_server_destroy(session)); diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index c224a3b7b11..00d02886920 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -277,10 +277,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) conn = S2C(session); cache = conn->cache; - /* - * The thread group code calls us repeatedly. So each call is one pass through eviction. - */ - WT_TRACK_TIME(session); +/* + * The thread group code calls us repeatedly. So each call is one pass through eviction. + */ +#ifdef HAVE_DIAGNOSTIC + __wt_seconds32(session, &session->op_5043_seconds); +#endif if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) { /* * Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We @@ -2295,21 +2297,23 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d if (timer) time_start = __wt_clock(session); - WT_TRACK_TIME(session); +#ifdef HAVE_DIAGNOSTIC + __wt_seconds32(session, &session->op_5043_seconds); +#endif for (initial_progress = cache->eviction_progress;; ret = 0) { /* - * A pathological case: if we're the oldest transaction in the - * system and the eviction server is stuck trying to find space - * (and we're not in recovery, because those transactions can't - * be rolled back), abort the transaction to give up all hazard - * pointers before trying again. + * If eviction is stuck, check if this thread is likely causing problems and should be + * rolled back. Ignore if in recovery, those transactions can't be rolled back. 
*/ - if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session) && - !F_ISSET(conn, WT_CONN_RECOVERING)) { - --cache->evict_aggressive_score; - WT_STAT_CONN_INCR(session, txn_fail_cache); - WT_ERR( - __wt_txn_rollback_required(session, "oldest transaction rolled back for eviction")); + if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) { + ret = __wt_txn_is_blocking_old(session); + if (ret == 0) + ret = __wt_txn_is_blocking_pin(session); + if (ret == WT_ROLLBACK) { + --cache->evict_aggressive_score; + WT_STAT_CONN_INCR(session, txn_fail_cache); + } + WT_ERR(ret); } /* diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 41ecfb40242..785c6219c6b 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -678,15 +678,8 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool /* Reconcile the page. */ ret = __wt_reconcile(session, ref, NULL, flags, lookaside_retryp); - - /* - * If attempting eviction during a checkpoint, we may successfully reconcile but then find that - * there are updates on the page too new to evict. Give up evicting in that case: checkpoint - * will include the reconciled page when it visits the parent. 
- */ - if (WT_SESSION_BTREE_SYNC(session) && !__wt_page_is_modified(page) && - !__wt_txn_visible_all(session, page->modify->rec_max_txn, page->modify->rec_max_timestamp)) - return (__wt_set_return(session, EBUSY)); + WT_ASSERT(session, __wt_page_is_modified(page) || + __wt_txn_visible_all(session, page->modify->rec_max_txn, page->modify->rec_max_timestamp)); /* * If reconciliation fails but reports it might succeed if we use the lookaside table, try again diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 817ccbae553..36cefa8dc68 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -27,11 +27,9 @@ #define WT_SINGLE_THREAD_CHECK_STOP(s) \ if (--(s)->api_enter_refcnt == 0) \ WT_PUBLISH((s)->api_tid, 0); -#define WT_TRACK_TIME(s) __wt_seconds32((s), &(s)->op_start) #else #define WT_SINGLE_THREAD_CHECK_START(s) #define WT_SINGLE_THREAD_CHECK_STOP(s) -#define WT_TRACK_TIME(s) (s)->op_start = 0 #endif /* Standard entry points to the API: declares/initializes local variables. */ @@ -46,8 +44,8 @@ * correct. \ */ \ WT_TRACK_OP_INIT(s); \ - (s)->op_start = 0; \ WT_SINGLE_THREAD_CHECK_START(s); \ + __wt_op_timer_start(s); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ /* Reset wait time if this isn't an API reentry. */ \ if (__oldname == NULL) \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 68d6f53c0f3..d168d10593c 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -230,21 +230,31 @@ struct __wt_ovfl_reuse { /* * WT_PAGE_LOOKASIDE -- - * Related information for on-disk pages with lookaside entries. + * Information for on-disk pages with lookaside entries. + * + * This information is used to decide whether history evicted to lookaside is + * needed for a read, and when it is no longer needed at all. 
We track the + * newest update written to the disk image in `max_ondisk_ts`, and the oldest + * update skipped to choose the on-disk version in `min_skipped_ts`. If no + * updates were skipped, then the disk image contains the newest versions of + * all updates and `min_skipped_ts == WT_TS_MAX`. + * + * For reads without a timestamp, we check that there are no skipped updates + * and that the reader's snapshot can see everything on disk. + * + * For readers with a timestamp, it is safe to ignore lookaside if either + * (a) there are no skipped updates and everything on disk is visible, or + * (b) everything on disk is visible, and the minimum skipped update is in + * the future of the reader. */ struct __wt_page_lookaside { - uint64_t las_pageid; /* Page ID in lookaside */ - uint64_t max_txn; /* Maximum transaction ID */ - uint64_t unstable_txn; /* First transaction ID not on page */ - wt_timestamp_t max_timestamp; /* Maximum timestamp */ - wt_timestamp_t unstable_timestamp; /* First timestamp not on page */ - wt_timestamp_t unstable_durable_timestamp; - /* First durable timestamp not on - * page */ - bool eviction_to_lookaside; /* Revert to lookaside on eviction */ - bool has_prepares; /* One or more updates are prepared */ - bool resolved; /* History has been read into cache */ - bool skew_newest; /* Page image has newest versions */ + uint64_t las_pageid; /* Page ID in lookaside */ + uint64_t max_txn; /* Maximum transaction ID */ + wt_timestamp_t max_ondisk_ts; /* Maximum timestamp on disk */ + wt_timestamp_t min_skipped_ts; /* Skipped in favor of disk version */ + bool eviction_to_lookaside; /* Revert to lookaside on eviction */ + bool has_prepares; /* One or more updates are prepared */ + bool resolved; /* History has been read into cache */ }; /* @@ -909,7 +919,7 @@ struct __wt_ref { WT_SESSION_IMPL *session; const char *name; const char *func; - uint32_t time_sec; /* DEBUGGING field for rare hang. 
*/ + uint32_t time_sec; uint16_t line; uint16_t state; } hist[WT_REF_SAVE_STATE_MAX]; diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 3f80ee5cda7..2fa3e0d94d3 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1160,12 +1160,10 @@ __wt_page_las_active(WT_SESSION_IMPL *session, WT_REF *ref) return (false); if (page_las->resolved) return (false); - if (!page_las->skew_newest || page_las->has_prepares) + if (page_las->min_skipped_ts != WT_TS_MAX || page_las->has_prepares) return (true); - if (__wt_txn_visible_all(session, page_las->max_txn, page_las->max_timestamp)) - return (false); - return (true); + return (!__wt_txn_visible_all(session, page_las->max_txn, page_las->max_ondisk_ts)); } /* diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 174263c3949..32becc05467 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -204,6 +204,8 @@ struct __wt_connection_impl { /* Configuration */ const WT_CONFIG_ENTRY **config_entries; + uint64_t operation_timeout_us; /* Maximum operation period before rollback */ + const char *optrack_path; /* Directory for operation logs */ WT_FH *optrack_map_fh; /* Name to id translation file. */ WT_SPINLOCK optrack_map_spinlock; /* Translation file spinlock. 
*/ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d02b4dca326..4844a88380c 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1421,8 +1421,14 @@ extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_global_set_timestamp(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const char **cfg) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_is_blocking_old(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_txn_is_blocking_pin(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_log_op(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) @@ -1532,8 +1538,6 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_split_page_size(int split_pct, uint32_t maxpagesize, uint32_t allocsize) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern uint64_t __wt_clock_to_nsec(uint64_t end, uint64_t begin) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint64_t __wt_ext_transaction_id(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint64_t __wt_ext_transaction_oldest(WT_EXTENSION_API *wt_api) @@ -1607,8 +1611,6 @@ extern void __wt_curtable_set_key(WT_CURSOR *cursor, ...); extern void __wt_curtable_set_value(WT_CURSOR *cursor, 
...); extern void __wt_encrypt_size( WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep); -extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) - WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_err_func( WT_SESSION_IMPL *session, int error, const char *func, int line, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 5, 6))) @@ -1687,9 +1689,6 @@ extern void __wt_root_ref_init( extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_schema_destroy_colgroup(WT_SESSION_IMPL *session, WT_COLGROUP **colgroupp); extern void __wt_scr_discard(WT_SESSION_IMPL *session); -extern void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp) - WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp); extern void __wt_session_close_cache(WT_SESSION_IMPL *session); extern void __wt_session_gen_enter(WT_SESSION_IMPL *session, int which); extern void __wt_session_gen_leave(WT_SESSION_IMPL *session, int which); @@ -1722,7 +1721,6 @@ extern void __wt_txn_clear_timestamp_queues(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session); -extern void __wt_txn_global_shutdown(WT_SESSION_IMPL *session); extern void __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session); extern void __wt_txn_op_free(WT_SESSION_IMPL *session, WT_TXN_OP *op); extern void __wt_txn_publish_read_timestamp(WT_SESSION_IMPL *session); @@ -1770,6 +1768,8 @@ static inline bool __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *p WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_off_page(WT_PAGE *page, const void *p) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline bool 
__wt_op_timer_fired(WT_SESSION_IMPL *session) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_page_del_active(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) @@ -1797,10 +1797,6 @@ static inline bool __wt_session_can_wait(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_split_descent_race(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline bool __wt_txn_am_oldest(WT_SESSION_IMPL *session) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -static inline bool __wt_txn_upd_durable(WT_SESSION_IMPL *session, WT_UPDATE *upd) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline bool __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) @@ -2073,6 +2069,8 @@ static inline uint64_t __wt_cell_rle(WT_CELL_UNPACK *unpack) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline uint64_t __wt_clock(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +static inline uint64_t __wt_clock_to_nsec(uint64_t end, uint64_t begin) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline uint64_t __wt_rdtsc(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); static inline uint64_t __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2110,6 +2108,8 @@ static inline void __wt_cond_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)); static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session); static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL 
*session); +static inline void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); +static inline void __wt_op_timer_start(WT_SESSION_IMPL *session); static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref); static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page); static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page); @@ -2137,6 +2137,8 @@ static inline void __wt_row_leaf_value_cell(WT_SESSION_IMPL *session, WT_PAGE *p WT_CELL_UNPACK *kpack, WT_CELL_UNPACK *vpack); static inline void __wt_row_leaf_value_set(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *unpack); static inline void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp); +static inline void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp); +static inline void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp); static inline void __wt_spin_backoff(uint64_t *yield_count, uint64_t *sleep_usecs); static inline void __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t); static inline void __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t); diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h index 189bc948714..a2280aefa4f 100644 --- a/src/third_party/wiredtiger/src/include/extern_posix.h +++ b/src/third_party/wiredtiger/src/include/extern_posix.h @@ -51,7 +51,8 @@ extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled); -extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); +extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) + WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_sleep(uint64_t seconds, uint64_t 
micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_stream_set_line_buffer(FILE *fp) diff --git a/src/third_party/wiredtiger/src/include/misc.i b/src/third_party/wiredtiger/src/include/misc.i index 7b908ac3871..e937858ba5b 100644 --- a/src/third_party/wiredtiger/src/include/misc.i +++ b/src/third_party/wiredtiger/src/include/misc.i @@ -29,49 +29,6 @@ __wt_hex(int c) return ((u_char) "0123456789abcdef"[c]); } -/* - * __wt_rdtsc -- - * Get a timestamp from CPU registers. - */ -static inline uint64_t -__wt_rdtsc(void) -{ -#if defined(__i386) - { - uint64_t x; - - __asm__ volatile("rdtsc" : "=A"(x)); - return (x); - } -#elif defined(__amd64) - { - uint64_t a, d; - - __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); - return ((d << 32) | a); - } -#else - return (0); -#endif -} - -/* - * __wt_clock -- - * Obtain a timestamp via either a CPU register or via a system call on platforms where - * obtaining it directly from the hardware register is not supported. - */ -static inline uint64_t -__wt_clock(WT_SESSION_IMPL *session) -{ - struct timespec tsp; - - if (__wt_process.use_epochtime) { - __wt_epoch(session, &tsp); - return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec)); - } - return (__wt_rdtsc()); -} - /* * __wt_strdup -- * ANSI strdup function. diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 22f63ae4ff4..8403097e03a 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -33,12 +33,9 @@ struct __wt_reconcile { /* Track the page's min/maximum transactions. */ uint64_t max_txn; - wt_timestamp_t max_timestamp; - - /* Lookaside boundary tracking. */ - uint64_t unstable_txn; - wt_timestamp_t unstable_durable_timestamp; - wt_timestamp_t unstable_timestamp; + wt_timestamp_t max_ts; + wt_timestamp_t max_ondisk_ts; + wt_timestamp_t min_skipped_ts; u_int updates_seen; /* Count of updates seen. 
*/ u_int updates_unstable; /* Count of updates not visible_all. */ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index b1da78f4668..01eae24cb44 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -61,7 +61,12 @@ struct __wt_session_impl { const char *name; /* Name */ const char *lastop; /* Last operation */ uint32_t id; /* UID, offset in session array */ - uint32_t op_start; /* DEBUGGING: Operation start time (seconds) */ + + uint64_t operation_start_us; /* Operation start */ + uint64_t operation_timeout_us; /* Maximum operation period before rollback */ +#ifdef HAVE_DIAGNOSTIC + uint32_t op_5043_seconds; /* Temporary debugging to catch WT-5043, discard after 01/2020. */ +#endif WT_EVENT_HANDLER *event_handler; /* Application's event handlers */ diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i new file mode 100644 index 00000000000..bad2f0417ad --- /dev/null +++ b/src/third_party/wiredtiger/src/include/time.i @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 2014-2019 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_rdtsc -- + * Get a timestamp from CPU registers. + */ +static inline uint64_t +__wt_rdtsc(void) +{ +#if defined(__i386) + { + uint64_t x; + + __asm__ volatile("rdtsc" : "=A"(x)); + return (x); + } +#elif defined(__amd64) + { + uint64_t a, d; + + __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); + return ((d << 32) | a); + } +#else + return (0); +#endif +} + +/* + * __time_check_monotonic -- + * Check and prevent time running backward. If we detect that it has, we set the time structure + * to the previous values, making time stand still until we see a time in the future of the + * highest value seen so far. 
+ */ +static inline void +__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) +{ + /* + * Detect time going backward. If so, use the last saved timestamp. + */ + if (session == NULL) + return; + + if (tsp->tv_sec < session->last_epoch.tv_sec || + (tsp->tv_sec == session->last_epoch.tv_sec && tsp->tv_nsec < session->last_epoch.tv_nsec)) { + WT_STAT_CONN_INCR(session, time_travel); + *tsp = session->last_epoch; + } else + session->last_epoch = *tsp; +} + +/* + * __wt_epoch -- + * Return the time since the Epoch. + */ +static inline void +__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) +{ + struct timespec tmp; + + /* + * Read into a local variable, then check for monotonically increasing time, ensuring single + * threads never see time move backward. We don't prevent multiple threads from seeing time move + * backwards (even when reading time serially, the saved last-read time is per thread, not per + * timer, so multiple threads can race the time). Nor do we prevent multiple threads + * simultaneously reading the time from seeing random time or time moving backwards (assigning + * the time structure to the returned memory location implies multicycle writes to memory). + */ + __wt_epoch_raw(session, &tmp); + __time_check_monotonic(session, &tmp); + *tsp = tmp; +} + +/* + * __wt_clock -- + * Obtain a timestamp via either a CPU register or via a system call on platforms where + * obtaining it directly from the hardware register is not supported. + */ +static inline uint64_t +__wt_clock(WT_SESSION_IMPL *session) +{ + struct timespec tsp; + + /* + * In one case we return nanoseconds, in the other we return clock ticks. That looks wrong, but + * it's not. When simply comparing before and after values, which is returned doesn't matter. 
+ * When trying to calculate wall-clock time (that is, comparing a starting time with an ending + * time), we'll subtract the two values and then call a function to convert the result of the + * subtraction into nanoseconds. In the case where we already have nanoseconds, that function + * has a conversion constant of 1 and we'll skip the conversion, in the case where we have clock + * ticks, the conversion constant will be real. The reason is because doing it that way avoids a + * floating-point operation per wall-clock time calculation. + */ + if (__wt_process.use_epochtime) { + __wt_epoch(session, &tsp); + return ((uint64_t)(tsp.tv_sec * WT_BILLION + tsp.tv_nsec)); + } + return (__wt_rdtsc()); +} + +/* + * __wt_seconds -- + * Return the seconds since the Epoch. + */ +static inline void +__wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp) +{ + struct timespec t; + + __wt_epoch(session, &t); + + *secondsp = (uint64_t)(t.tv_sec + t.tv_nsec / WT_BILLION); +} + +/* + * __wt_seconds32 -- + * Return the seconds since the Epoch in 32 bits. + */ +static inline void +__wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp) +{ + uint64_t seconds; + + /* This won't work in 2038. But for now allow it. */ + __wt_seconds(session, &seconds); + *secondsp = (uint32_t)seconds; +} + +/* + * __wt_clock_to_nsec -- + * Convert from clock ticks to nanoseconds. + */ +static inline uint64_t +__wt_clock_to_nsec(uint64_t end, uint64_t begin) +{ + double clock_diff; + + /* + * If the ticks were reset, consider it an invalid check and just return zero as the time + * difference because we cannot compute anything meaningful. + */ + if (end < begin) + return (0); + clock_diff = (double)(end - begin); + return ((uint64_t)(clock_diff / __wt_process.tsc_nsec_ratio)); +} + +/* + * __wt_op_timer_start -- + * Start the operations timer. + */ +static inline void +__wt_op_timer_start(WT_SESSION_IMPL *session) +{ + session->operation_start_us = session->operation_timeout_us == 0 ? 
0 : __wt_clock(session); +} + +/* + * __wt_op_timer_fired -- + * Check the operations timers. + */ +static inline bool +__wt_op_timer_fired(WT_SESSION_IMPL *session) +{ + uint64_t diff, now; + + /* Check for both a timeout and a start time to avoid any future configuration races. */ + if (session->operation_timeout_us == 0 || session->operation_start_us == 0) + return (false); + + now = __wt_clock(session); + diff = WT_CLOCKDIFF_US(now, session->operation_start_us); + return (diff > session->operation_timeout_us); +} diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 3e5d2bfd850..6d7ead93201 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -792,19 +792,6 @@ __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) return (WT_VISIBLE_TRUE); } -/* - * __wt_txn_upd_durable -- - * Can the current transaction make the given update durable. - */ -static inline bool -__wt_txn_upd_durable(WT_SESSION_IMPL *session, WT_UPDATE *upd) -{ - /* If update is visible then check if it is durable. */ - if (__wt_txn_upd_visible_type(session, upd) != WT_VISIBLE_TRUE) - return (false); - return (__wt_txn_visible(session, upd->txnid, upd->durable_ts)); -} - /* * __wt_txn_upd_visible -- * Can the current transaction see the given update. @@ -871,8 +858,12 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - /* Stall here if the cache is completely full. */ - WT_RET(__wt_cache_eviction_check(session, false, true, NULL)); + /* + * Stall here if the cache is completely full. We have allocated a transaction ID which + * makes it possible for eviction to decide we're contributing to the problem and return + * WT_ROLLBACK. The WT_SESSION.begin_transaction API can't return rollback, continue on. 
+ */ + WT_RET_ERROR_OK(__wt_cache_eviction_check(session, false, true, NULL), WT_ROLLBACK); __wt_txn_get_snapshot(session); } @@ -1144,40 +1135,6 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) __wt_txn_get_snapshot(session); } -/* - * __wt_txn_am_oldest -- - * Am I the oldest transaction in the system? - */ -static inline bool -__wt_txn_am_oldest(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *s; - uint64_t id; - uint32_t i, session_cnt; - - conn = S2C(session); - txn = &session->txn; - txn_global = &conn->txn_global; - - if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE)) - return (false); - - WT_ORDERED_READ(session_cnt, conn->session_cnt); - for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) - /* - * We are checking if the transaction is oldest one in the system. It is safe to ignore any - * sessions that are allocating transaction IDs, since we already have an ID, they are - * guaranteed to be newer. - */ - if (!s->is_allocating && (id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id)) - return (false); - - return (true); -} - /* * __wt_txn_activity_check -- * Check whether there are any running transactions. diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index b9fed57f9ad..892d78b89a4 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -1723,6 +1723,12 @@ struct __wt_session { * \c "read-committed"\, \c "snapshot"; default empty.} * @config{name, name of the transaction for tracing and debugging., a string; default * empty.} + * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed + * real time milliseconds taken to complete database operations in this transaction. Time + * is measured from the start of each WiredTiger API call. 
There is no guarantee any + * operation will not take longer than this amount of time. If WiredTiger notices the limit + * has been exceeded\, an operation may return a WT_ROLLBACK error. Default is to have no + * limit., an integer greater than or equal to 1; default \c 0.} * @config{priority, priority of the transaction for resolving conflicts. Transactions with * higher values are less likely to abort., an integer between -100 and 100; default \c 0.} * @config{read_timestamp, read using the specified timestamp. The supplied value must not @@ -1906,16 +1912,17 @@ struct __wt_session { * "to=" to drop all checkpoints before and including the named checkpoint. * Checkpoints cannot be dropped while a hot backup is in progress or if open in a cursor., * a list of strings; default empty.} - * @config{force, by default\, checkpoints may be skipped if the underlying object has not - * been modified\, this option forces the checkpoint., a boolean flag; default \c false.} + * @config{force, if false (the default)\, checkpoints may be skipped if the underlying + * object has not been modified\, if true\, this option forces the checkpoint., a boolean + * flag; default \c false.} * @config{name, if set\, specify a name for the checkpoint (note that checkpoints including * LSM trees may not be named)., a string; default empty.} * @config{target, if non-empty\, checkpoint the list of objects., a list of strings; * default empty.} - * @config{use_timestamp, by default\, create the checkpoint as of the last stable timestamp - * if timestamps are in use\, or all current updates if there is no stable timestamp set. - * If false\, this option generates a checkpoint with all updates including those later than - * the timestamp., a boolean flag; default \c true.} + * @config{use_timestamp, if true (the default)\, create the checkpoint as of the last + * stable timestamp if timestamps are in use\, or all current updates if there is no stable + * timestamp set. 
If false\, this option generates a checkpoint with all updates including + * those later than the timestamp., a boolean flag; default \c true.} * @configend * @errors */ @@ -2259,6 +2266,12 @@ struct __wt_connection { * database. Each worker thread uses a session handle from the configured session_max., an * integer between 3 and 20; default \c 4.} * @config{ ),,} + * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed + * real time milliseconds application threads will take to complete database operations. + * Time is measured from the start of each WiredTiger API call. There is no guarantee any + * operation will not take longer than this amount of time. If WiredTiger notices the limit + * has been exceeded\, an operation may return a WT_ROLLBACK error. Default is to have no + * limit., an integer greater than or equal to 1; default \c 0.} * @config{operation_tracking = (, enable tracking of performance-critical functions. See * @ref operation_tracking for more information., a set of related configuration options * defined below.} @@ -2493,18 +2506,16 @@ struct __wt_connection { /*! * Rollback in-memory non-logged state to an earlier point in time. * - * This method uses a timestamp to define the rollback point, and thus - * requires that the application uses timestamps and that the - * stable_timestamp must have been set via a call to - * WT_CONNECTION::set_timestamp. Any updates to checkpoint durable - * tables that are more recent than the stable timestamp are removed. + * This method uses a timestamp to define the rollback point, and requires the application + * use timestamps, the stable_timestamp have been set via a call to + * WT_CONNECTION::set_timestamp, and a checkpoint operating on the last stable timestamp + * to have completed. Any updates to checkpoint durable tables that are more recent than + * the stable timestamp are removed. 
* - * This method requires that there are no active operations for the - * duration of the call. + * This method requires that there are no active operations for the duration of the call. * - * Any updates made to logged tables will not be rolled back. Any - * updates made without an associated timestamp will not be rolled - * back. See @ref transaction_timestamps. + * Any updates made to logged tables will not be rolled back. Any updates made without an + * associated timestamp will not be rolled back. See @ref transaction_timestamps. * * @snippet ex_all.c rollback to stable * @@ -2912,6 +2923,12 @@ struct __wt_connection { * @config{multiprocess, permit sharing between processes (will automatically start an RPC server * for primary processes and use RPC for secondary processes). Not yet supported in * WiredTiger., a boolean flag; default \c false.} + * @config{operation_timeout_ms, when non-zero\, a requested limit on the number of elapsed real + * time milliseconds application threads will take to complete database operations. Time is + * measured from the start of each WiredTiger API call. There is no guarantee any operation will + * not take longer than this amount of time. If WiredTiger notices the limit has been exceeded\, an + * operation may return a WT_ROLLBACK error. Default is to have no limit., an integer greater than + * or equal to 1; default \c 0.} * @config{operation_tracking = (, enable tracking of performance-critical functions. 
See @ref * operation_tracking for more information., a set of related configuration options defined below.} * @config{    enabled, enable operation tracking subsystem., a boolean flag; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 3bc4f02c258..2b281443f21 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -430,6 +430,7 @@ typedef uint64_t wt_timestamp_t; #include "packing.i" #include "reconcile.i" #include "serial.i" +#include "time.i" #if defined(__cplusplus) } diff --git a/src/third_party/wiredtiger/src/os_posix/os_time.c b/src/third_party/wiredtiger/src/os_posix/os_time.c index 9b4729994df..6009a532c8c 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_time.c +++ b/src/third_party/wiredtiger/src/os_posix/os_time.c @@ -14,6 +14,7 @@ */ void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { WT_DECL_RET; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_child.c b/src/third_party/wiredtiger/src/reconcile/rec_child.c index 99342d8ed94..b1d696e2ac6 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_child.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_child.c @@ -94,10 +94,7 @@ __rec_child_deleted(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, WT_C if (F_ISSET(r, WT_REC_EVICT)) return (__wt_set_return(session, EBUSY)); - /* - * If there are deleted child pages we can't discard immediately, keep the page dirty so they - * are eventually freed. - */ + /* If the page cannot be marked clean. 
*/ r->leave_dirty = true; /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 9f3150d362b..06dcf73fbb5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -8,6 +8,19 @@ #include "wt_internal.h" +/* + * __rec_update_durable -- + * Return whether an update is suitable for writing to a disk image. + */ +static bool +__rec_update_durable(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd) +{ + return (F_ISSET(r, WT_REC_VISIBLE_ALL) ? + __wt_txn_upd_visible_all(session, upd) : + __wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE && + __wt_txn_visible(session, upd->txnid, upd->durable_ts)); +} + /* * __rec_update_save -- * Save a WT_UPDATE list for later restoration. @@ -111,11 +124,11 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_CELL_UNPACK *vpack, WT_UPDATE_SELECT *upd_select) { WT_PAGE *page; - WT_UPDATE *first_ts_upd, *first_txn_upd, *first_upd, *upd; - wt_timestamp_t timestamp, ts; + WT_UPDATE *first_txn_upd, *first_upd, *upd; + wt_timestamp_t max_ts; size_t upd_memsize; uint64_t max_txn, txnid; - bool all_visible, list_prepared, list_uncommitted, skipped_birthmark; + bool all_stable, list_prepared, list_uncommitted, skipped_birthmark; /* * The "saved updates" return value is used independently of returning an update we can write, @@ -125,8 +138,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v upd_select->upd_saved = false; page = r->page; - first_ts_upd = first_txn_upd = NULL; + first_txn_upd = NULL; upd_memsize = 0; + max_ts = WT_TS_NONE; max_txn = WT_TXN_NONE; list_prepared = list_uncommitted = skipped_birthmark = false; @@ -152,8 +166,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v */ if (first_txn_upd == NULL) first_txn_upd = upd; - - /* Track the largest 
transaction ID seen. */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; @@ -170,21 +182,23 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * prepared transaction IDs are globally visible, need to check the update state as well. */ if (F_ISSET(r, WT_REC_EVICT)) { - if (upd->prepare_state == WT_PREPARE_LOCKED || - upd->prepare_state == WT_PREPARE_INPROGRESS) { - list_prepared = true; - continue; - } if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? WT_TXNID_LE(r->last_running, txnid) : !__txn_visible_id(session, txnid)) { r->update_uncommitted = list_uncommitted = true; continue; } + if (upd->prepare_state == WT_PREPARE_LOCKED || + upd->prepare_state == WT_PREPARE_INPROGRESS) { + list_prepared = true; + if (upd->start_ts > max_ts) + max_ts = upd->start_ts; + continue; + } } /* Track the first update with non-zero timestamp. */ - if (first_ts_upd == NULL && upd->start_ts != WT_TS_NONE) - first_ts_upd = upd; + if (upd->durable_ts > max_ts) + max_ts = upd->durable_ts; /* * Select the update to write to the disk image. @@ -202,8 +216,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v if (upd_select->upd == NULL && r->las_skew_newest) upd_select->upd = upd; - if (F_ISSET(r, WT_REC_VISIBLE_ALL) ? !__wt_txn_upd_visible_all(session, upd) : - !__wt_txn_upd_durable(session, upd)) { + if (!__rec_update_durable(session, r, upd)) { if (F_ISSET(r, WT_REC_EVICT)) ++r->updates_unstable; @@ -214,21 +227,29 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v * discard an uncommitted update. */ if (F_ISSET(r, WT_REC_UPDATE_RESTORE) && upd_select->upd != NULL && - (list_prepared || list_uncommitted)) { - r->leave_dirty = true; + (list_prepared || list_uncommitted)) return (__wt_set_return(session, EBUSY)); - } if (upd->type == WT_UPDATE_BIRTHMARK) skipped_birthmark = true; + /* + * Track the oldest update not on the page. 
+ * + * This is used to decide whether reads can use the + * page image, hence using the start rather than the + * durable timestamp. + */ + if (upd_select->upd == NULL && upd->start_ts < r->min_skipped_ts) + r->min_skipped_ts = upd->start_ts; + continue; } /* * Lookaside without stable timestamp was taken care of above - * (set to the first uncommitted transaction). Lookaside with - * stable timestamp always takes the first stable update. + * (set to the first uncommitted transaction). All other + * reconciliation takes the first stable update. */ if (upd_select->upd == NULL) upd_select->upd = upd; @@ -262,6 +283,9 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v if (upd == first_txn_upd) r->update_used = true; + if (upd != NULL && upd->durable_ts > r->max_ondisk_ts) + r->max_ondisk_ts = upd->durable_ts; + /* * TIMESTAMP-FIXME The start timestamp is determined by the commit timestamp when the key is * first inserted (or last updated). The end timestamp is set when a key/value pair becomes @@ -308,8 +332,8 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v r->max_txn = max_txn; /* Update the maximum timestamp. */ - if (first_ts_upd != NULL && r->max_timestamp < first_ts_upd->durable_ts) - r->max_timestamp = first_ts_upd->durable_ts; + if (max_ts > r->max_ts) + r->max_ts = max_ts; /* * If the update we chose was a birthmark, or we are doing update-restore and we skipped a @@ -327,19 +351,15 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v } /* - * Check if all updates on the page are visible. If not, it must stay - * dirty unless we are saving updates to the lookaside table. + * Check if all updates on the page are visible, if not, it must stay dirty. * - * Updates can be out of transaction ID order (but not out of timestamp - * order), so we track the maximum transaction ID and the newest update - * with a timestamp (if any). 
+ * Updates can be out of transaction ID order (but not out of timestamp order), so we track the + * maximum transaction ID and the newest update with a timestamp (if any). */ - timestamp = first_ts_upd == NULL ? 0 : first_ts_upd->durable_ts; - all_visible = upd == first_txn_upd && !list_prepared && !list_uncommitted && - (F_ISSET(r, WT_REC_VISIBLE_ALL) ? __wt_txn_visible_all(session, max_txn, timestamp) : - __wt_txn_visible(session, max_txn, timestamp)); + all_stable = upd == first_txn_upd && !list_prepared && !list_uncommitted && + __wt_txn_visible_all(session, max_txn, max_ts); - if (all_visible) + if (all_stable) goto check_original_value; r->leave_dirty = true; @@ -347,9 +367,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v if (F_ISSET(r, WT_REC_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, "reconciliation error, update not visible"); - /* - * If not trying to evict the page, we know what we'll write and we're done. - */ + /* If not trying to evict the page, we know what we'll write and we're done. */ if (!F_ISSET(r, WT_REC_EVICT)) goto check_original_value; @@ -382,54 +400,6 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, v WT_RET(__rec_update_save(session, r, ins, ripcip, upd_select->upd, upd_memsize)); upd_select->upd_saved = true; - /* - * Track the first off-page update when saving history in the lookaside table. When skewing - * newest, we want the first (non-aborted) update after the one stored on the page. Otherwise, - * we want the update before the on-page update. 
- */ - if (F_ISSET(r, WT_REC_LOOKASIDE) && r->las_skew_newest) { - if (WT_TXNID_LT(r->unstable_txn, first_upd->txnid)) - r->unstable_txn = first_upd->txnid; - if (first_ts_upd != NULL) { - WT_ASSERT(session, first_ts_upd->prepare_state == WT_PREPARE_INPROGRESS || - first_ts_upd->start_ts <= first_ts_upd->durable_ts); - - if (r->unstable_timestamp < first_ts_upd->start_ts) - r->unstable_timestamp = first_ts_upd->start_ts; - - if (r->unstable_durable_timestamp < first_ts_upd->durable_ts) - r->unstable_durable_timestamp = first_ts_upd->durable_ts; - } - } else if (F_ISSET(r, WT_REC_LOOKASIDE)) { - for (upd = first_upd; upd != upd_select->upd; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (upd->txnid != WT_TXN_NONE && WT_TXNID_LT(upd->txnid, r->unstable_txn)) - r->unstable_txn = upd->txnid; - - /* - * The durable timestamp is always set by commit, and usually the same as the start - * timestamp, which makes it OK to use the two independently and be confident both will - * be set. - */ - WT_ASSERT(session, - upd->prepare_state == WT_PREPARE_INPROGRESS || upd->durable_ts >= upd->start_ts); - - if (r->unstable_timestamp > upd->start_ts) - r->unstable_timestamp = upd->start_ts; - - /* - * An in-progress prepared update will always have a zero durable timestamp. Checkpoints - * can only skip reading lookaside history if all updates are in the future, including - * the prepare, so including the prepare timestamp instead. - */ - ts = upd->prepare_state == WT_PREPARE_INPROGRESS ? upd->start_ts : upd->durable_ts; - if (r->unstable_durable_timestamp > ts) - r->unstable_durable_timestamp = ts; - } - } - check_original_value: /* * Paranoia: check that we didn't choose an update that has since been rolled back. 
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 6bd67f329e1..26b1849693a 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -404,7 +404,7 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) * discard its history). */ mod->rec_max_txn = r->max_txn; - mod->rec_max_timestamp = r->max_timestamp; + mod->rec_max_timestamp = r->max_ts; /* * Track the tree's maximum transaction ID (used to decide if it's safe to discard the @@ -416,8 +416,8 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r) if (!F_ISSET(r, WT_REC_EVICT)) { if (WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) btree->rec_max_txn = r->max_txn; - if (btree->rec_max_timestamp < r->max_timestamp) - btree->rec_max_timestamp = r->max_timestamp; + if (btree->rec_max_timestamp < r->max_ts) + btree->rec_max_timestamp = r->max_ts; } /* @@ -651,22 +651,8 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO /* Track the page's min/maximum transaction */ r->max_txn = WT_TXN_NONE; - r->max_timestamp = 0; - - /* - * Track the first unstable transaction (when skewing newest this is the newest update, - * otherwise the newest update not on the page). This is the boundary between the on-page - * information and the history stored in the lookaside table. - */ - if (r->las_skew_newest) { - r->unstable_txn = WT_TXN_NONE; - r->unstable_timestamp = WT_TS_NONE; - r->unstable_durable_timestamp = WT_TS_NONE; - } else { - r->unstable_txn = WT_TXN_ABORTED; - r->unstable_timestamp = WT_TS_MAX; - r->unstable_durable_timestamp = WT_TS_MAX; - } + r->max_ondisk_ts = r->max_ts = WT_TS_NONE; + r->min_skipped_ts = WT_TS_MAX; /* Track if updates were used and/or uncommitted. 
*/ r->updates_seen = r->updates_unstable = 0; @@ -1649,17 +1635,9 @@ __rec_split_write_supd( done: if (F_ISSET(r, WT_REC_LOOKASIDE)) { /* Track the oldest lookaside timestamp seen so far. */ - multi->page_las.skew_newest = r->las_skew_newest; multi->page_las.max_txn = r->max_txn; - multi->page_las.unstable_txn = r->unstable_txn; - WT_ASSERT(session, r->unstable_txn != WT_TXN_NONE); - multi->page_las.max_timestamp = r->max_timestamp; - - WT_ASSERT(session, r->all_upd_prepare_in_prog == true || - r->unstable_durable_timestamp >= r->unstable_timestamp); - - multi->page_las.unstable_timestamp = r->unstable_timestamp; - multi->page_las.unstable_durable_timestamp = r->unstable_durable_timestamp; + multi->page_las.max_ondisk_ts = r->max_ondisk_ts; + multi->page_las.min_skipped_ts = r->min_skipped_ts; } err: diff --git a/src/third_party/wiredtiger/src/support/time.c b/src/third_party/wiredtiger/src/support/time.c deleted file mode 100644 index 61cebb71b51..00000000000 --- a/src/third_party/wiredtiger/src/support/time.c +++ /dev/null @@ -1,109 +0,0 @@ -/*- - * Copyright (c) 2014-2019 MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. - */ - -#include "wt_internal.h" - -/* - * __time_check_monotonic -- - * Check and prevent time running backward. If we detect that it has, we set the time structure - * to the previous values, making time stand still until we see a time in the future of the - * highest value seen so far. - */ -static void -__time_check_monotonic(WT_SESSION_IMPL *session, struct timespec *tsp) -{ - /* - * Detect time going backward. If so, use the last saved timestamp. 
- */ - if (session == NULL) - return; - - if (tsp->tv_sec < session->last_epoch.tv_sec || - (tsp->tv_sec == session->last_epoch.tv_sec && tsp->tv_nsec < session->last_epoch.tv_nsec)) { - WT_STAT_CONN_INCR(session, time_travel); - *tsp = session->last_epoch; - } else - session->last_epoch = *tsp; -} - -/* - * __wt_epoch -- - * Return the time since the Epoch. - */ -void -__wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) -{ - struct timespec tmp; - - /* - * Read into a local variable, then check for monotonically increasing time, ensuring single - * threads never see time move backward. We don't prevent multiple threads from seeing time move - * backwards (even when reading time serially, the saved last-read time is per thread, not per - * timer, so multiple threads can race the time). Nor do we prevent multiple threads - * simultaneously reading the time from seeing random time or time moving backwards (assigning - * the time structure to the returned memory location implies multicycle writes to memory). - */ - __wt_epoch_raw(session, &tmp); - __time_check_monotonic(session, &tmp); - *tsp = tmp; -} - -/* - * __wt_seconds -- - * Return the seconds since the Epoch. - */ -void -__wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp) - WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) -{ - struct timespec t; - - __wt_epoch(session, &t); - - /* - * A time_t isn't guaranteed to fit into a uint64_t, but it's asserted when WiredTiger builds. - */ - *secondsp = (uint64_t)t.tv_sec; -} - -/* - * __wt_seconds32 -- - * Return the seconds since the Epoch in 32 bits. - */ -void -__wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp) -{ - struct timespec t; - - __wt_epoch(session, &t); - - /* - * This won't work in 2038. But for now allow it. - */ - *secondsp = (uint32_t)t.tv_sec; -} - -/* - * __wt_clock_to_nsec -- - * Convert from clock ticks to nanoseconds. 
- */ -uint64_t -__wt_clock_to_nsec(uint64_t end, uint64_t begin) -{ - double clock_diff; - - /* - * If the ticks were reset, consider it an invalid check and just return zero as the time - * difference because we cannot compute anything meaningful. - */ - if (end < begin) - return (0); - clock_diff = (double)(end - begin); - return ((uint64_t)(clock_diff / __wt_process.tsc_nsec_ratio)); -} diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 0374732dfa7..09caef4345e 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -468,6 +468,12 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_STRING_MATCH("read-committed", cval.str, cval.len) ? WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED; + /* Retrieve the maximum operation time, defaulting to the database-wide configuration. */ + WT_RET(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval)); + session->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND); + if (session->operation_timeout_us == 0) + session->operation_timeout_us = S2C(session)->operation_timeout_us; + /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. @@ -615,6 +621,9 @@ __wt_txn_release(WT_SESSION_IMPL *session) */ txn->flags = 0; txn->prepare_timestamp = WT_TS_NONE; + + /* Clear operation timer. */ + session->operation_timeout_us = 0; } /* @@ -1501,19 +1510,143 @@ __wt_txn_activity_drain(WT_SESSION_IMPL *session) * __wt_txn_global_shutdown -- * Shut down the global transaction state. 
*/ -void -__wt_txn_global_shutdown(WT_SESSION_IMPL *session) +int +__wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const char **cfg) { + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *s; + const char *ckpt_cfg; + + conn = S2C(session); + + /* + * Perform a system-wide checkpoint so that all tables are consistent with each other. All + * transactions are resolved but ignore timestamps to make sure all data gets to disk. Do this + * before shutting down all the subsystems. We have shut down all user sessions, but send in + * true for waiting for internal races. + */ + WT_TRET(__wt_config_gets(session, cfg, "use_timestamp", &cval)); + ckpt_cfg = "use_timestamp=false"; + if (cval.val != 0) { + ckpt_cfg = "use_timestamp=true"; + if (conn->txn_global.has_stable_timestamp) + F_SET(conn, WT_CONN_CLOSING_TIMESTAMP); + } + if (!F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY)) { + s = NULL; + WT_TRET(__wt_open_internal_session(conn, "close_ckpt", true, 0, &s)); + if (s != NULL) { + const char *checkpoint_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_checkpoint), ckpt_cfg, NULL}; + wt_session = &s->iface; + WT_TRET(__wt_txn_checkpoint(s, checkpoint_cfg, true)); + + /* + * Mark the metadata dirty so we flush it on close, allowing recovery to be skipped. + */ + WT_WITH_DHANDLE(s, WT_SESSION_META_DHANDLE(s), __wt_tree_modify_set(s)); + + WT_TRET(wt_session->close(wt_session, config)); + } + } + /* - * All application transactions have completed, ignore the pinned - * timestamp so that updates can be evicted from the cache during - * connection close. + * All application transactions have completed, ignore the pinned timestamp so that updates can + * be evicted from the cache during connection close. * - * Note that we are relying on a special case in __wt_txn_visible_all - * that returns true during close when there is no pinned timestamp - * set. 
+ * Note that we are relying on a special case in __wt_txn_visible_all that returns true during + * close when there is no pinned timestamp set. */ - S2C(session)->txn_global.has_pinned_timestamp = false; + conn->txn_global.has_pinned_timestamp = false; + + return (ret); +} + +/* + * __wt_txn_is_blocking_old -- + * Return if this transaction is the oldest transaction in the system, called by eviction to + * determine if a worker thread should be released from eviction. + */ +int +__wt_txn_is_blocking_old(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *state; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + + if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE)) + return (false); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + + /* + * Check if the transaction is oldest one in the system. It's safe to ignore sessions allocating + * transaction IDs, since we already have an ID, they are guaranteed to be newer. + */ + for (i = 0, state = txn_global->states; i < session_cnt; i++, state++) { + if (state->is_allocating) + continue; + + WT_ORDERED_READ(id, state->id); + if (id != WT_TXN_NONE && WT_TXNID_LT(id, txn->id)) + break; + } + return (i == session_cnt ? + __wt_txn_rollback_required(session, "oldest transaction ID rolled back for eviction") : + 0); +} + +/* + * __wt_txn_is_blocking_pin -- + * Return if this transaction is likely blocking eviction because of a pinned transaction ID, + * called by eviction to determine if a worker thread should be released from eviction. + */ +int +__wt_txn_is_blocking_pin(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *s; + WT_TXN *txn; + uint64_t snap_min; + uint32_t i, session_cnt; + + conn = S2C(session); + txn = &session->txn; + + /* + * Check if we hold the oldest pinned transaction ID in the system. 
This potentially means + * rolling back a read-only transaction, which MongoDB can't (yet) handle. For this reason, + * don't check unless we're configured to time out thread operations, a way to confirm our + * caller is prepared for rollback. + */ + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || txn->snap_min == WT_TXN_NONE) + return (0); + if (!__wt_op_timer_fired(session)) + return (0); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + + for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) { + if (F_ISSET(s, WT_SESSION_INTERNAL) || !F_ISSET(&s->txn, WT_TXN_HAS_SNAPSHOT)) + continue; + + WT_ORDERED_READ(snap_min, s->txn.snap_min); + if (snap_min != WT_TXN_NONE && snap_min < txn->snap_min) + break; + } + return (i == session_cnt ? __wt_txn_rollback_required( + session, "oldest pinned transaction ID rolled back for eviction") : + 0); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 072406a25cc..ccfd378b3b7 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -618,7 +618,7 @@ __checkpoint_prepare(WT_SESSION_IMPL *session, bool *trackingp, const char *cfg[ } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) txn_global->meta_ckpt_timestamp = txn_global->recovery_timestamp; } else if (!F_ISSET(conn, WT_CONN_RECOVERING)) - txn_global->meta_ckpt_timestamp = 0; + txn_global->meta_ckpt_timestamp = WT_TS_NONE; __wt_writeunlock(session, &txn_global->rwlock); @@ -949,13 +949,26 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) __checkpoint_stats(session); /* - * If timestamps were used to define the content of the checkpoint update the saved last - * checkpoint timestamp, otherwise leave it alone. If a checkpoint is taken without - * timestamps, it's likely a bug, but we don't want to clear the saved last checkpoint - * timestamp regardless. 
+ * If timestamps defined the checkpoint's content, set the saved last checkpoint timestamp, + * otherwise clear it. We clear it for a couple of reasons: applications can query it and we + * don't want to lie, and we use it to decide if WT_CONNECTION.rollback_to_stable is an + * allowed operation. For the same reason, don't set it to WT_TS_NONE when the checkpoint + * timestamp is WT_TS_NONE, set it to 1 so we can tell the difference. */ - if (use_timestamp) - conn->txn_global.last_ckpt_timestamp = ckpt_tmp_ts; + if (use_timestamp) { + conn->txn_global.last_ckpt_timestamp = use_timestamp ? ckpt_tmp_ts : WT_TS_NONE; + /* + * MongoDB assumes the checkpoint timestamp will be initialized with WT_TS_NONE. In such + * cases it queries the recovery timestamp to determine the last stable recovery + * timestamp. So, if the recovery timestamp is valid, set the last checkpoint timestamp + * to recovery timestamp. This should never be a problem, as checkpoint timestamp should + * never be less than recovery timestamp. This could potentially avoid MongoDB making + * two calls to determine last stable recovery timestamp. 
+ */ + if (conn->txn_global.last_ckpt_timestamp == WT_TS_NONE) + conn->txn_global.last_ckpt_timestamp = conn->txn_global.recovery_timestamp; + } else + conn->txn_global.last_ckpt_timestamp = WT_TS_NONE; } err: diff --git a/src/third_party/wiredtiger/src/txn/txn_recover.c b/src/third_party/wiredtiger/src/txn/txn_recover.c index 17e0b61c904..6ccb7625108 100644 --- a/src/third_party/wiredtiger/src/txn/txn_recover.c +++ b/src/third_party/wiredtiger/src/txn/txn_recover.c @@ -536,7 +536,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) r.session = session; WT_MAX_LSN(&r.max_ckpt_lsn); WT_MAX_LSN(&r.max_rec_lsn); - conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0; + conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = WT_TS_NONE; F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 97c83c47414..0b2ec12a47a 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -222,21 +222,20 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t { WT_DECL_RET; WT_PAGE *page; + WT_PAGE_LOOKASIDE *page_las; uint32_t read_flags; bool local_read; /* - * If we created a page image with updates the need to be rolled back, + * If we created a page image with updates that need to be rolled back, * read the history into cache now and make sure the page is marked * dirty. Otherwise, the history we need could be swept from the * lookaside table before the page is read because the lookaside sweep * code has no way to tell that the page image is invalid. * * So, if there is lookaside history for a page, first check if the - * history needs to be rolled back make sure that history is loaded - * into cache. 
That is, if skew_newest is true, so the disk image - * potentially contained unstable updates, and the history is more - * recent than the rollback timestamp. + * history needs to be rolled back then ensure the history is loaded + * into cache. * * Also, we have separately discarded any lookaside history more recent * than the rollback timestamp. For page_las structures in cache, @@ -247,9 +246,8 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t */ local_read = false; read_flags = WT_READ_WONT_NEED; - if (ref->page_las != NULL) { - if (ref->page_las->skew_newest && - rollback_timestamp < ref->page_las->unstable_durable_timestamp) { + if ((page_las = ref->page_las) != NULL) { + if (rollback_timestamp < page_las->max_ondisk_ts) { /* * Make sure we get back a page with history, not a limbo page. */ @@ -258,13 +256,10 @@ __txn_abort_newer_updates(WT_SESSION_IMPL *session, WT_REF *ref, wt_timestamp_t WT_ASSERT(session, ref->state != WT_REF_LIMBO && ref->page != NULL && __wt_page_is_modified(ref->page)); local_read = true; + page_las->max_ondisk_ts = rollback_timestamp; } - if (ref->page_las->max_timestamp > rollback_timestamp) - ref->page_las->max_timestamp = rollback_timestamp; - if (ref->page_las->unstable_durable_timestamp > rollback_timestamp) - ref->page_las->unstable_durable_timestamp = rollback_timestamp; - if (ref->page_las->unstable_timestamp > rollback_timestamp) - ref->page_las->unstable_timestamp = rollback_timestamp; + if (rollback_timestamp < page_las->min_skipped_ts) + page_las->min_skipped_ts = rollback_timestamp; } /* Review deleted page saved to the ref */ @@ -436,8 +431,10 @@ __txn_rollback_to_stable_check(WT_SESSION_IMPL *session) conn = S2C(session); txn_global = &conn->txn_global; - if (!txn_global->has_stable_timestamp) - WT_RET_MSG(session, EINVAL, "rollback_to_stable requires a stable timestamp"); + + if (!txn_global->has_stable_timestamp || txn_global->last_ckpt_timestamp == WT_TS_NONE) + WT_RET_MSG( + session, 
EINVAL, "rollback_to_stable requires a checkpoint with a stable timestamp"); /* * Help the user comply with the requirement that there are no concurrent operations. Protect diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index bf20d7568bc..c6131ac0e77 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -54,7 +54,8 @@ functions: set -o errexit set -o verbose if [ "Windows_NT" = "$OS" ]; then - scons.bat --enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic ${smp_command|} + pip install scons==3.1.1 + scons-3.1.1.bat --enable-python=c:\\swigwin-3.0.2\\swig.exe --enable-diagnostic ${smp_command|} else cd build_posix sh ./reconf @@ -991,7 +992,8 @@ tasks: set -o errexit set -o verbose - scons.bat ${smp_command|} "CFLAGS=/Gv /wd4090 /wd4996 /we4047 /we4024 /TC /we4100 /we4133" wiredtiger.dll libwiredtiger.lib + pip install scons==3.1.1 + scons-3.1.1.bat ${smp_command|} "CFLAGS=/Gv /wd4090 /wd4996 /we4047 /we4024 /TC /we4100 /we4133" wiredtiger.dll libwiredtiger.lib - name: fops depends_on: @@ -1025,9 +1027,6 @@ tasks: cmd.exe /c "cd test\\format && ..\\..\\t_format.exe reverse=0 encryption=none logging_compression=none runs=20" - name: million-collection-test - depends_on: [] - run_on: - - rhel62-large commands: - func: "fetch source" - func: "fetch mongo-tests repo" @@ -1035,6 +1034,7 @@ tasks: params: working_dir: mongo-tests script: | + sudo su set -o errexit set -o verbose ulimit -n 1000000 @@ -1176,10 +1176,10 @@ tasks: buildvariants: -- name: ubuntu1404 - display_name: Ubuntu 14.04 +- name: ubuntu1804 + display_name: Ubuntu 18.04 run_on: - - ubuntu1404-test + - ubuntu1804-test expansions: # It's ugly, but we need the absolute path here, not the relative test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. 
top_builddir=$(pwd) @@ -1244,10 +1244,10 @@ buildvariants: - name: unit-test-bucket07 - name: fops -- name: ubuntu1404-python3 - display_name: Ubuntu 14.04 (Python3) +- name: ubuntu1804-python3 + display_name: Ubuntu 18.04 (Python3) run_on: - - ubuntu1404-test + - ubuntu1804-test expansions: test_env_vars: PATH=/opt/mongodbtoolchain/v3/bin:$PATH LD_LIBRARY_PATH=$(pwd)/.libs top_srcdir=$(pwd)/.. top_builddir=$(pwd) smp_command: -j $(grep -c ^processor /proc/cpuinfo) @@ -1270,7 +1270,7 @@ buildvariants: display_name: Large scale testing batchtime: 1440 # 1 day run_on: - - rhel62-large + - rhel80-build expansions: configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ tasks: @@ -1280,14 +1280,14 @@ buildvariants: display_name: Compatibility tests batchtime: 10080 # 7 days run_on: - - ubuntu1404-test + - ubuntu1804-test tasks: - name: compatibility-test-for-mongodb-releases - name: windows-64 display_name: Windows 64-bit run_on: - - windows-64-vs2013-test + - windows-64-vs2017-test tasks: - name: compile - name: compile-windows-alt @@ -1337,7 +1337,7 @@ buildvariants: modules: - enterprise run_on: - - ubuntu1604-zseries-small + - ubuntu1804-zseries-build batchtime: 10080 # 7 days expansions: smp_command: -j $(grep -c ^processor /proc/cpuinfo) diff --git a/src/third_party/wiredtiger/test/format/Makefile.am b/src/third_party/wiredtiger/test/format/Makefile.am index da55ffece4c..2d9bbf21eb8 100644 --- a/src/third_party/wiredtiger/test/format/Makefile.am +++ b/src/third_party/wiredtiger/test/format/Makefile.am @@ -4,7 +4,7 @@ AM_CPPFLAGS +=-I$(top_srcdir)/test/utility noinst_PROGRAMS = t t_SOURCES =\ - backup.c bulk.c compact.c config.c lrt.c ops.c rebalance.c \ + backup.c bulk.c compact.c config.c lrt.c ops.c random.c rebalance.c \ salvage.c snap.c t.c util.c wts.c t_LDADD = $(top_builddir)/test/utility/libtest_util.la diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 
58decce75af..492d5124a1c 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -222,6 +222,9 @@ static CONFIG c[] = {{"abort", "if timed run should drop core", /* 0% */ {"quiet", "quiet run (same as -q)", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_quiet, NULL}, + {"random_cursor", "if random cursor reads configured", /* 10% */ + C_BOOL, 10, 0, 0, &g.c_random_cursor, NULL}, + {"read_pct", "percent operations that are reads", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL}, {"rebalance", "rebalance testing", /* 100% */ diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index e90bbf86998..890f03c845c 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -178,6 +178,7 @@ typedef struct { uint32_t c_prefix_compression_min; uint32_t c_prepare; uint32_t c_quiet; + uint32_t c_random_cursor; uint32_t c_read_pct; uint32_t c_rebalance; uint32_t c_repeat_data_pct; @@ -345,6 +346,7 @@ void key_gen_insert(WT_RAND_STATE *, WT_ITEM *, uint64_t); void key_gen_teardown(WT_ITEM *); void key_init(void); WT_THREAD_RET lrt(void *); +WT_THREAD_RET random_kv(void *); void path_setup(const char *); int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool); uint32_t rng(WT_RAND_STATE *); diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index a03b42e427b..6f5e7943c83 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -87,7 +87,7 @@ wts_ops(bool lastrun) TINFO *tinfo, total; WT_CONNECTION *conn; WT_SESSION *session; - wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid; + wt_thread_t alter_tid, backup_tid, checkpoint_tid, compact_tid, lrt_tid, random_tid; wt_thread_t timestamp_tid; int64_t fourths, quit_fourths, thread_ops; uint32_t i; @@ -101,6 +101,7 @@ wts_ops(bool lastrun) 
memset(&checkpoint_tid, 0, sizeof(checkpoint_tid)); memset(&compact_tid, 0, sizeof(compact_tid)); memset(&lrt_tid, 0, sizeof(lrt_tid)); + memset(&random_tid, 0, sizeof(random_tid)); memset(&timestamp_tid, 0, sizeof(timestamp_tid)); modify_repl_init(); @@ -183,6 +184,8 @@ wts_ops(bool lastrun) testutil_check(__wt_thread_create(NULL, &compact_tid, compact, NULL)); if (!SINGLETHREADED && g.c_long_running_txn) testutil_check(__wt_thread_create(NULL, &lrt_tid, lrt, NULL)); + if (g.c_random_cursor) + testutil_check(__wt_thread_create(NULL, &random_tid, random_kv, NULL)); if (g.c_txn_timestamps) testutil_check(__wt_thread_create(NULL, &timestamp_tid, timestamp, tinfo_list)); @@ -267,6 +270,8 @@ wts_ops(bool lastrun) testutil_check(__wt_thread_join(NULL, &compact_tid)); if (!SINGLETHREADED && g.c_long_running_txn) testutil_check(__wt_thread_join(NULL, &lrt_tid)); + if (g.c_random_cursor) + testutil_check(__wt_thread_join(NULL, &random_tid)); if (g.c_txn_timestamps) testutil_check(__wt_thread_join(NULL, &timestamp_tid)); g.workers_finished = false; diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c new file mode 100644 index 00000000000..131cb0bd258 --- /dev/null +++ b/src/third_party/wiredtiger/test/format/random.c @@ -0,0 +1,95 @@ +/*- + * Public Domain 2014-2019 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors.
We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "format.h" + +/* + * random_kv -- + * Do random cursor operations. + */ +WT_THREAD_RET +random_kv(void *arg) +{ + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_ITEM key, value; + WT_SESSION *session; + uint32_t i; + u_int period; + const char *config; + bool simple; + + (void)(arg); /* Unused parameter */ + + conn = g.wts_conn; + + /* Random cursor ops are only supported on row-store. */ + if (g.type != ROW) + return (WT_THREAD_RET_VALUE); + + /* Open a session. */ + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + + for (simple = false;;) { + /* Alternate between simple random cursors and sample-size random cursors. */ + config = simple ? "next_random=true" : "next_random=true,next_random_sample_size=37"; + simple = !simple; + + /* + * open_cursor can return EBUSY if concurrent with a metadata operation, retry in that case. + */ + while ((ret = session->open_cursor(session, g.uri, NULL, config, &cursor)) == EBUSY) + __wt_yield(); + testutil_check(ret); + + /* This is just a smoke-test, get some key/value pairs. */ + for (i = mmrand(NULL, 0, 1000); i > 0; --i) { + testutil_check(cursor->next(cursor)); + testutil_check(cursor->get_key(cursor, &key)); + testutil_check(cursor->get_value(cursor, &value)); + } + + testutil_check(cursor->close(cursor)); + + /* Sleep for some number of seconds. 
*/ + period = mmrand(NULL, 1, 10); + + /* Sleep for short periods so we don't make the run wait. */ + while (period > 0 && !g.workers_finished) { + --period; + __wt_sleep(1, 0); + } + if (g.workers_finished) + break; + } + + testutil_check(session->close(session, NULL)); + + return (WT_THREAD_RET_VALUE); +} diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 89a72f090e7..f3482861573 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -162,7 +162,8 @@ wts_open(const char *home, bool set_api, WT_CONNECTION **connp) ",cache_size=%" PRIu32 "MB" ",checkpoint_sync=false" - ",error_prefix=\"%s\"", + ",error_prefix=\"%s\"" + ",operation_timeout_ms=2000", g.c_cache, progname); /* In-memory configuration. */ diff --git a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py index f248a05e646..09597e7a38f 100644 --- a/src/third_party/wiredtiger/test/suite/test_debug_mode05.py +++ b/src/third_party/wiredtiger/test/suite/test_debug_mode05.py @@ -43,9 +43,11 @@ class test_debug_mode05(wttest.WiredTigerTestCase): def test_table_logging_rollback_to_stable(self): self.session.create(self.uri, 'key_format=i,value_format=u') + cursor = self.session.open_cursor(self.uri, None) self.conn.set_timestamp('stable_timestamp=' + timestamp_str(100)) + self.session.checkpoint() # Try doing a normal prepared txn and then rollback to stable. 
self.session.begin_transaction() diff --git a/src/third_party/wiredtiger/test/suite/test_las01.py b/src/third_party/wiredtiger/test/suite/test_las01.py index 76f19b51768..679d01ae06d 100755 --- a/src/third_party/wiredtiger/test/suite/test_las01.py +++ b/src/third_party/wiredtiger/test/suite/test_las01.py @@ -83,10 +83,11 @@ class test_las01(wttest.WiredTigerTestCase): # Skip the initial rows, which were not updated for i in range(0, nrows+1): self.assertEqual(cursor.next(), 0) - if (check_value != cursor.get_value()): - print("Check value : " + str(check_value)) - print("value : " + str(cursor.get_value())) - self.assertTrue(check_value == cursor.get_value()) + if check_value != cursor.get_value(): + session.breakpoint() + self.assertTrue(check_value == cursor.get_value(), + "for key " + str(i) + ", expected " + str(check_value) + + ", got " + str(cursor.get_value())) cursor.close() session.close() conn.close() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index acbad7e02a4..9e0e4a0cec0 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -78,7 +78,8 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Search for the expected items as well as iterating. for k, v in expected.items(): if missing == False: - self.assertEqual(cur[k], v, "for key " + str(k)) + self.assertEqual(cur[k], v, "for key " + str(k) + + " expected " + str(v) + ", got " + str(cur[k])) else: cur.set_key(k) if self.empty: @@ -162,7 +163,11 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Roll back half timestamps. stable_ts = timestamp_str(key_range // 2) self.conn.set_timestamp('stable_timestamp=' + stable_ts) + + # We're about to test rollback-to-stable which requires a checkpoint to which we can roll back. 
+ self.session.checkpoint() self.conn.rollback_to_stable() + stat_cursor = self.session.open_cursor('statistics:', None, None) calls = stat_cursor[stat.conn.txn_rollback_to_stable][2] upd_aborted = (stat_cursor[stat.conn.txn_rollback_upd_aborted][2] + diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp06.py b/src/third_party/wiredtiger/test/suite/test_timestamp06.py index 55981f67a98..fd004a23703 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp06.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp06.py @@ -157,7 +157,7 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess): # Scenario: 1 # Check that we see all the latest values (i.e. 3) as per transaction - # visibility when reading with out the read timestamp. + # visibility when reading without the read timestamp. # All tables should see all the values. self.check(self.session, "", self.table_ts_log, dict((k, 3) for k in orig_keys)) @@ -204,8 +204,12 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess): self.ckpt_backup(2, (nkeys - valcnt_ts_log), (nkeys - valcnt_ts_nolog)) # Scenario: 3 - # Check that we see all the data values correctly after rollback + # Check we see all the data values correctly after rollback. Skip the case where the most + # recent checkpoint wasn't based on the last stable timestamp, those can't be rolled back. + if self.ckpt_ts == False: + return self.conn.rollback_to_stable() + # All tables should see the values correctly when read with # read timestamp as stable timestamp. self.check(self.session, 'read_timestamp=' + stable_ts, @@ -214,7 +218,7 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess): self.table_ts_log, dict((k, 2) for k in orig_keys)) # Scenario: 4 - # Check that we see the values correctly when read with out any + # Check that we see the values correctly when read without any # timestamp. if self.using_log == True: # For logged table we should see latest values (i.e. 
3) when logging @@ -224,21 +228,13 @@ class test_timestamp06(wttest.WiredTigerTestCase, suite_subprocess): else: # When logging is disabled, we should not see the values beyond the # stable timestamp with timestamped checkpoints. - if self.ckpt_ts == True: - self.check(self.session, "", - self.table_ts_log, dict((k, 2) for k in orig_keys)) - else: - self.check(self.session, "", - self.table_ts_log, dict((k, 3) for k in orig_keys)) + self.check(self.session, "", + self.table_ts_log, dict((k, 2) for k in orig_keys)) # For non-logged table we should not see the values beyond the # stable timestamp with timestamped checkpoints. - if self.ckpt_ts == True: - self.check(self.session, "", - self.table_ts_nolog, dict((k, 2) for k in orig_keys)) - else: - self.check(self.session, "", - self.table_ts_nolog, dict((k, 3) for k in orig_keys)) + self.check(self.session, "", + self.table_ts_nolog, dict((k, 2) for k in orig_keys)) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp11.py b/src/third_party/wiredtiger/test/suite/test_timestamp11.py index 1256a544d78..f3d03cd8fa5 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp11.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp11.py @@ -83,6 +83,7 @@ class test_timestamp11(wttest.WiredTigerTestCase, suite_subprocess): # stable_ts = timestamp_str(2) self.conn.set_timestamp('stable_timestamp=' + stable_ts) + self.session.checkpoint() self.conn.rollback_to_stable() c = self.session.open_cursor(uri) diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp16.py b/src/third_party/wiredtiger/test/suite/test_timestamp16.py index bef116d62a9..20663889450 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp16.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp16.py @@ -50,25 +50,21 @@ class test_timestamp16(wttest.WiredTigerTestCase, suite_subprocess): self.session.begin_transaction('read_timestamp=100') 
self.session.rollback_transaction() self.session.checkpoint('use_timestamp=true') - self.assertTimestampsEqual('0', - self.conn.query_timestamp('get=last_checkpoint')) + self.assertTimestampsEqual('0', self.conn.query_timestamp('get=last_checkpoint')) - # Set a stable and make sure that we still checkpoint at - # the stable. - self.conn.set_timestamp('stable_timestamp=1') + # Set a stable and make sure that we still checkpoint at the stable. + self.conn.set_timestamp('stable_timestamp=2') self.session.begin_transaction('read_timestamp=100') self.session.rollback_transaction() self.session.checkpoint('use_timestamp=true') - self.assertTimestampsEqual('1', - self.conn.query_timestamp('get=last_checkpoint')) + self.assertTimestampsEqual('2', self.conn.query_timestamp('get=last_checkpoint')) # Finally make sure that commit also resets the read timestamp. self.session.create(self.uri, 'key_format=i,value_format=i') self.session.begin_transaction('read_timestamp=150') self.session.commit_transaction() self.session.checkpoint('use_timestamp=true') - self.assertTimestampsEqual('1', - self.conn.query_timestamp('get=last_checkpoint')) + self.assertTimestampsEqual('2', self.conn.query_timestamp('get=last_checkpoint')) if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_txn21.py b/src/third_party/wiredtiger/test/suite/test_txn21.py new file mode 100644 index 00000000000..212a4d321b6 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_txn21.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2019 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. 
+# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_txn21.py +# Transactions: smoke test the operation timeout API +# + +import wiredtiger, wttest + +class test_txn21(wttest.WiredTigerTestCase): + + # Connection-level configuration. + def test_operation_timeout_conn(self): + # Close the automatically opened connection and open one with the timeout configuration. + conn_config = 'operation_timeout_ms=2000' + self.conn.close() + self.conn = wiredtiger.wiredtiger_open(self.home, conn_config) + + # Transaction-level configuration. + def test_operation_timeout_txn(self): + self.session.begin_transaction('operation_timeout_ms=2000') + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1