diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-12-03 03:01:41 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-12-03 03:01:41 +0000 |
commit | 587f15f0f823924c852b261497110e4b78dca7fe (patch) | |
tree | d2e92233b4d39b061729597b938c42b67502eaa7 /src/third_party | |
parent | 2e948c4e94b17089ab56a5437447f9988c31103d (diff) | |
download | mongo-587f15f0f823924c852b261497110e4b78dca7fe.tar.gz |
Import wiredtiger: d47dcd1f0ea992775be3d60456593c575451c435 from branch mongodb-4.4
ref: 58115abb6f..d47dcd1f0e
for: 4.3.3
WT-4996 Migrate Jenkins “wiredtiger-test-check-long” job to Evergreen
WT-5082 Application threads are tasked with eviction even when pinning the oldest transaction ID
WT-5232 Create a wrapper script to support format stress tests in Evergreen
WT-5265 Remove pip install gcovr from coverage-report test
WT-5274 format.sh must handle core-dump signals and "gdb attach" build mode
Diffstat (limited to 'src/third_party')
17 files changed, 713 insertions, 113 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh index db92cb95931..db92cb95931 100644..100755 --- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh +++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh index 398c6a9bcf5..398c6a9bcf5 100644..100755 --- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh +++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index c50d86678e6..30cdd4e5d06 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -172,8 +172,8 @@ AC_CHECK_LIB(dl, dlopen) AC_CHECK_LIB(rt, sched_yield) AC_CHECK_FUNCS([\ - clock_gettime fallocate ftruncate gettimeofday posix_fadvise\ - posix_fallocate posix_madvise strtouq sync_file_range timer_create]) + clock_gettime fallocate ftruncate gettimeofday posix_fadvise posix_fallocate\ + posix_madvise setrlimit strtouq sync_file_range timer_create]) # OS X wrongly reports that it has fdatasync AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])]) diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h index 7b2d3fd63bf..c5c0dfda580 100644 --- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h +++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h @@ -79,6 +79,9 @@ /* Define to 1 if pthread condition variables support monotonic clocks. */ /* #undef HAVE_PTHREAD_COND_MONOTONIC */ +/* Define to 1 if you have the `setrlimit' function. */ +/* #undef HAVE_SETRLIMIT */ + /* Define to 1 if you have the `posix_fadvise' function. */ /* #undef HAVE_POSIX_FADVISE */ diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 362efcebaff..10065020dd8 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "58115abb6fbb3c1cc7bfd087d41a47347bce9a69", + "commit": "d47dcd1f0ea992775be3d60456593c575451c435", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-4.4" diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 7f729c2e661..0faaacc710c 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -2296,9 +2296,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d * rolled back. Ignore if in recovery, those transactions can't be rolled back. */ if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) { - ret = __wt_txn_is_blocking_old(session); - if (ret == 0) - ret = __wt_txn_is_blocking_pin(session); + ret = __wt_txn_is_blocking(session); if (ret == WT_ROLLBACK) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 36cefa8dc68..533f276b15c 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -70,6 +70,7 @@ if ((ret) != 0 && (ret) != WT_NOTFOUND && (ret) != WT_DUPLICATE_KEY && \ (ret) != WT_PREPARE_CONFLICT && F_ISSET(&(s)->txn, WT_TXN_RUNNING)) \ F_SET(&(s)->txn, WT_TXN_ERROR); \ + __wt_op_timer_stop(s); \ /* \ * No code after this line, otherwise error handling \ * won't be correct. \ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index d42e0d43d9d..2b00f07ae07 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1426,9 +1426,7 @@ extern int __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_is_blocking_old(WT_SESSION_IMPL *session) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_txn_is_blocking_pin(WT_SESSION_IMPL *session) +extern int __wt_txn_is_blocking(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -2109,6 +2107,7 @@ static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session); static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session); static inline void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); static inline void __wt_op_timer_start(WT_SESSION_IMPL *session); +static inline void __wt_op_timer_stop(WT_SESSION_IMPL *session); static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref); static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page); static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page); diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i index 0dd6781216e..208243ef612 100644 --- a/src/third_party/wiredtiger/src/include/time.i +++ b/src/third_party/wiredtiger/src/include/time.i @@ -160,7 +160,18 @@ __wt_clock_to_nsec(uint64_t end, uint64_t begin) static inline void __wt_op_timer_start(WT_SESSION_IMPL *session) { - session->operation_start_us = session->operation_timeout_us == 0 ? 0 : __wt_clock(session); + uint64_t timeout_us; + + /* Timer can be configured per-transaction, and defaults to per-connection. */ + if ((timeout_us = session->txn.operation_timeout_us) == 0) + timeout_us = S2C(session)->operation_timeout_us; + if (timeout_us == 0) + session->operation_start_us = session->operation_timeout_us = 0; + else { + session->operation_start_us = __wt_clock(session); + session->operation_timeout_us = timeout_us; + } + #ifdef HAVE_DIAGNOSTIC /* * This is called at the beginning of each API call. We need to clear out any old values from @@ -172,6 +183,16 @@ __wt_op_timer_start(WT_SESSION_IMPL *session) } /* + * __wt_op_timer_stop -- + * Stop the operations timer. + */ +static inline void +__wt_op_timer_stop(WT_SESSION_IMPL *session) +{ + session->operation_start_us = session->operation_timeout_us = 0; +} + +/* * __wt_op_timer_fired -- * Check the operations timers. */ @@ -180,8 +201,7 @@ __wt_op_timer_fired(WT_SESSION_IMPL *session) { uint64_t diff, now; - /* Check for both a timeout and a start time to avoid any future configuration races. */ - if (session->operation_timeout_us == 0 || session->operation_start_us == 0) + if (session->operation_start_us == 0 || session->operation_timeout_us == 0) return (false); now = __wt_clock(session); diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index bdda7a4eae9..59d201e5110 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -320,6 +320,9 @@ struct __wt_txn { WT_ITEM *ckpt_snapshot; bool full_ckpt; + /* Timeout */ + uint64_t operation_timeout_us; + const char *rollback_reason; /* If rollback, the reason */ /* diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 5d4f4f8495d..8962d268459 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -469,9 +469,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) /* Retrieve the maximum operation time, defaulting to the database-wide configuration. */ WT_RET(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval)); - session->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND); - if (session->operation_timeout_us == 0) - session->operation_timeout_us = S2C(session)->operation_timeout_us; + txn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND); /* * The default sync setting is inherited from the connection, but can be overridden by an @@ -621,7 +619,7 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn->prepare_timestamp = WT_TS_NONE; /* Clear operation timer. */ - session->operation_timeout_us = 0; + txn->operation_timeout_us = 0; } /* @@ -1589,90 +1587,43 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const cha } /* - * __wt_txn_is_blocking_old -- - * Return if this transaction is the oldest transaction in the system, called by eviction to - * determine if a worker thread should be released from eviction. + * __wt_txn_is_blocking -- + * Return if this transaction is likely blocking eviction because of a pinned transaction ID, + * called by eviction to determine if a worker thread should be released from eviction. */ int -__wt_txn_is_blocking_old(WT_SESSION_IMPL *session) +__wt_txn_is_blocking(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; - WT_TXN_GLOBAL *txn_global; - WT_TXN_STATE *state; - uint64_t id; - uint32_t i, session_cnt; + uint64_t txn_oldest; conn = S2C(session); txn = &session->txn; - txn_global = &conn->txn_global; - if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE)) + /* We can't roll back prepared transactions. */ + if (F_ISSET(txn, WT_TXN_PREPARE)) return (false); - WT_ORDERED_READ(session_cnt, conn->session_cnt); - /* - * Check if the transaction is oldest one in the system. It's safe to ignore sessions allocating - * transaction IDs, since we already have an ID, they are guaranteed to be newer. + * Check the oldest transaction ID of either the current transaction ID or the snapshot. Using + * the snapshot potentially means rolling back a read-only transaction, which MongoDB can't + * (yet) handle. For this reason, don't use the snapshot unless there's also a transaction ID + * or we're configured to time out thread operations (a way to confirm our caller is prepared + * for rollback). */ - for (i = 0, state = txn_global->states; i < session_cnt; i++, state++) { - if (state->is_allocating) - continue; - - WT_ORDERED_READ(id, state->id); - if (id != WT_TXN_NONE && WT_TXNID_LT(id, txn->id)) - break; - } - return (i == session_cnt ? - __wt_txn_rollback_required(session, "oldest transaction ID rolled back for eviction") : + txn_oldest = txn->id; + if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && txn->snap_min != WT_TXN_NONE && + (txn_oldest != WT_TXN_NONE || __wt_op_timer_fired(session)) && + (txn_oldest == WT_TXN_NONE || WT_TXNID_LT(txn->snap_min, txn_oldest))) + txn_oldest = txn->snap_min; + return (txn_oldest == conn->txn_global.oldest_id ? + __wt_txn_rollback_required( + session, "oldest pinned transaction ID rolled back for eviction") : 0); } /* - * __wt_txn_is_blocking_pin -- - * Return if this transaction is likely blocking eviction because of a pinned transaction ID, - * called by eviction to determine if a worker thread should be released from eviction. - */ -int -__wt_txn_is_blocking_pin(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_SESSION_IMPL *s; - WT_TXN *txn; - uint64_t snap_min; - uint32_t i, session_cnt; - - conn = S2C(session); - txn = &session->txn; - - /* - * Check if we hold the oldest pinned transaction ID in the system. This potentially means - * rolling back a read-only transaction, which MongoDB can't (yet) handle. For this reason, - * don't check unless we're configured to time out thread operations, a way to confirm our - * caller is prepared for rollback. - */ - if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || txn->snap_min == WT_TXN_NONE) - return (0); - if (!__wt_op_timer_fired(session)) - return (0); - - WT_ORDERED_READ(session_cnt, conn->session_cnt); - - for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) { - if (F_ISSET(s, WT_SESSION_INTERNAL) || !F_ISSET(&s->txn, WT_TXN_HAS_SNAPSHOT)) - continue; - - WT_ORDERED_READ(snap_min, s->txn.snap_min); - if (snap_min != WT_TXN_NONE && snap_min < txn->snap_min) - break; - } - return (i == session_cnt ? __wt_txn_rollback_required( - session, "oldest pinned transaction ID rolled back for eviction") : - 0); -} - -/* * __wt_verbose_dump_txn_one -- * Output diagnostic information about a transaction structure. */ diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index b4677e3293d..e28772c915b 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -134,6 +134,30 @@ functions: for i in $(seq ${times|1}); do ./t -1 -c ${config|../../../test/format/CONFIG.stress} ${extra_args|} done + "many dbs test": + command: shell.exec + parms: + working_dir: "wiredtiger/build_posix/test/manydbs" + script: | + set -o errexit + set -o verbose + ${test_env_vars|} ./t ${many_db_args|} + "thread test": + command: shell.exec + parms: + working_dir: "wiredtiger/build_posix/test/thread" + script: | + set -o errexit + set -o verbose + ${test_env_vars|} ./t ${thread_test_args|} + "random abort test": + command: shell.exec + parms: + working_dir: "wiredtiger/build_posix/test/csuite" + script: | + set -o errexit + set -o verbose + ${test_env_vars|} ./test_random_abort ${random_abort_args|} "upload artifact": - command: archive.targz_pack params: @@ -1579,9 +1603,8 @@ tasks: script: | set -o errexit set -o verbose - # FIX ME Remove once BUILD-5025 is done - pip install gcovr --user - GCOV=/opt/mongodbtoolchain/v3/bin/gcov /home/ubuntu/.local/bin/gcovr -r .. -e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html + + GCOV=/opt/mongodbtoolchain/v3/bin/gcov gcovr -r .. -e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html - command: s3.put params: aws_secret: ${aws_secret} @@ -1638,6 +1661,111 @@ tasks: cp -rf WT_TEST WT_TEST_$file done + - name: ftruncate-test + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + posix_configure_flags: ac_cv_func_ftruncate=no + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix" + script: | + set -o errexit + set -o verbose + ${test_env_vars|} $(pwd)/../test/csuite/random_abort/smoke.sh 2>&1 + ${test_env_vars|} $(pwd)/../test/csuite/timestamp_abort/smoke.sh 2>&1 + ${test_env_vars|} $(pwd)/test/csuite/test_truncated_log 2>&1 + + - name: long-test + commands: + - func: "get project" + - func: "configure wiredtiger" + vars: + configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH CFLAGS="-g -Werror" + posix_configure_flags: --enable-silent-rules --enable-diagnostic --disable-static + - func: "make wiredtiger" + + # Run the long version of make check, that includes the full csuite tests + - func: "make check all" + vars: + test_env_vars: ${test_env_vars} TESTUTIL_ENABLE_LONG_TESTS=1 + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix" + script: | + set -o errexit + set -o verbose + + WT3363_CHECKPOINT_OP_RACES=1 test/csuite/./test_wt3363_checkpoint_op_races 2>&1 + + # Many dbs test - Run with: + # 1. The defaults + - func: "many dbs test" + # 2. Set idle flag to turn off operations. + - func: "many dbs test" + vars: + many_db_args: -I + # 3. More dbs. + - func: "many dbs test" + vars: + many_db_args: -D 40 + # 4. With idle flag and more dbs. + - func: "many dbs test" + vars: + many_db_args: -I -D 40 + + # extended test/thread runs + - func: "thread test" + vars: + thread_test_args: -t f + - func: "thread test" + vars: + thread_test_args: -S -F -n 100000 -t f + - func: "thread test" + vars: + thread_test_args: -t r + - func: "thread test" + vars: + thread_test_args: -S -F -n 100000 -t r + - func: "thread test" + vars: + thread_test_args: -t v + - func: "thread test" + vars: + thread_test_args: -S -F -n 100000 -t v + + # random-abort - default (random time and number of threads) + - func: "random abort test" + # random-abort - minimum time, random number of threads + - func: "random abort test" + vars: + random_abort_args: -t 10 + # random-abort - maximum time, random number of threads + - func: "random abort test" + vars: + random_abort_args: -t 40 + + # truncated-log + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/csuite/" + script: | + set -o errexit + set -o verbose + + ./test_truncated_log + + # format test + - func: "test format" + vars: + extra_args: file_type=fix + - func: "test format" + vars: + extra_args: file_type=row + + #FIXME: Add wtperf testing from Jenkin "wiredtiger-test-check-long" after fixing WT-5270 + - name: time-shift-sensitivity-test depends_on: - name: compile @@ -1683,6 +1811,8 @@ buildvariants: - name: spinlock-pthread-adaptive-test - name: compile-wtperf - name: wtperf-test + - name: ftruncate-test + - name: long-test - name: ubuntu1804-python3 display_name: Ubuntu 18.04 (Python3) diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index bae89f7e2f6..66c770cc809 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -28,6 +28,9 @@ #include "test_util.h" +#ifdef HAVE_SETRLIMIT +#include <sys/resource.h> +#endif #include <signal.h> #define EXTPATH "../../ext/" /* Extensions path */ @@ -349,6 +352,7 @@ WT_THREAD_RET random_kv(void *); void path_setup(const char *); int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool); uint32_t rng(WT_RAND_STATE *); +void set_core_off(void); void snap_init(TINFO *, uint64_t, bool); void snap_repeat_single(WT_CURSOR *, TINFO *); int snap_repeat_txn(WT_CURSOR *, TINFO *); diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh new file mode 100755 index 00000000000..722df756afe --- /dev/null +++ b/src/third_party/wiredtiger/test/format/format.sh @@ -0,0 +1,442 @@ +#! /bin/bash + +[ -z $BASH_VERSION ] && { + echo "$0 is a bash script: \$BASH_VERSION not set, exiting" + exit 1 +} + +name=$(basename $0) + +quit=0 +force_quit=0 +onintr() +{ + echo "$name: interrupted, cleaning up..." + force_quit=1 +} +trap 'onintr' 2 + +usage() { + echo "usage: $0 [-aFSv] [-c config] " + echo " [-h home] [-j parallel-jobs] [-n total-jobs] [-t minutes] [format-configuration]" + echo + echo " -a abort/recovery testing (defaults to off)" + echo " -c config format configuration file (defaults to CONFIG.stress)" + echo " -F quit on first failure (defaults to off)" + echo " -h home run directory (defaults to .)" + echo " -j parallel jobs to execute in parallel (defaults to 8)" + echo " -n total total jobs to execute (defaults to no limit)" + echo " -S run smoke-test configurations (defaults to off)" + echo " -t minutes minutes to run (defaults to no limit)" + echo " -v verbose output (defaults to off)" + echo " -- separates $name arguments from format arguments" + + exit 1 +} + +# Smoke-tests. +smoke_base_1="data_source=table rows=100000 threads=6 timer=4" +smoke_base_2="$smoke_base_1 leaf_page_max=9 internal_page_max=9" +smoke_list=( + # Three access methods. + "$smoke_base_1 file_type=fix" + "$smoke_base_1 file_type=row" + "$smoke_base_1 file_type=var" + + # Huffman key/value encoding. + "$smoke_base_1 file_type=row huffman_key=1 huffman_value=1" + "$smoke_base_1 file_type=var huffman_key=1 huffman_value=1" + + # Abort/recovery test. + "$smoke_base_1 file_type=row abort=1" + + # LSM + "$smoke_base_1 file_type=row data_source=lsm" + + # Force tree rebalance and the statistics server. + "$smoke_base_1 file_type=row statistics_server=1 rebalance=1" + + # Overflow testing. + "$smoke_base_2 file_type=var value_min=256" + "$smoke_base_2 file_type=row key_min=256" + "$smoke_base_2 file_type=row key_min=256 value_min=256" +) +smoke_next=0 + +abort_test=0 +build="" +config="CONFIG.stress" +first_failure=0 +format_args="" +home="." +minutes=0 +parallel_jobs=8 +smoke_test=0 +total_jobs=0 +verbose=0 + +while :; do + case "$1" in + -a) + abort_test=1 + shift ;; + -c) + config="$2" + shift ; shift ;; + -F) + first_failure=1 + shift ;; + -h) + home="$2" + shift ; shift ;; + -j) + parallel_jobs="$2" + [[ "$parallel_jobs" =~ ^[1-9][0-9]*$ ]] || { + echo "$name: -j option argument must be a non-zero integer" + exit 1 + } + shift ; shift ;; + -n) + total_jobs="$2" + [[ "$total_jobs" =~ ^[1-9][0-9]*$ ]] || { + echo "$name: -n option argument must be an non-zero integer" + exit 1 + } + shift ; shift ;; + -S) + smoke_test=1 + shift ;; + -t) + minutes="$2" + [[ "$minutes" =~ ^[1-9][0-9]*$ ]] || { + echo "$name: -t option argument must be a non-zero integer" + exit 1 + } + shift ; shift ;; + -v) + verbose=1 + shift ;; + --) + shift; break;; + -*) + usage ;; + *) + break ;; + esac +done +format_args="$*" + +verbose() +{ + [[ $verbose -ne 0 ]] && echo "$@" +} + +verbose "$name: run starting at $(date)" + +# Find a component we need. +# $1 name to find +find_file() +{ + # Get the directory path to format.sh, which is always in wiredtiger/test/format, then + # use that as the base for all the other places we check. + d=$(dirname $0) + + # Check wiredtiger/test/format/, likely location of the format binary and the CONFIG file. + f="$d/$1" + if [[ -f "$f" ]]; then + echo "$f" + return + fi + + # Check wiredtiger/build_posix/test/format/, likely location of the format binary and the + # CONFIG file. + f="$d/../../build_posix/test/format/$1" + if [[ -f "$f" ]]; then + echo "$f" + return + fi + + # Check wiredtiger/, likely location of the wt binary. + f="$d/../../$1" + if [[ -f "$f" ]]; then + echo "$f" + return + fi + + # Check wiredtiger/build_posix/, likely location of the wt binary. + f="$d/../../build_posix/$1" + if [[ -f "$f" ]]; then + echo "$f" + return + fi + + echo "./$1" +} + +# Find the format and wt binaries (the latter is only required for abort/recovery testing), +# the configuration file and the run directory. +format_binary=$(find_file "t") +[[ ! -x "$format_binary" ]] && { + echo "$name: format program \"$format_binary\" not found" + exit 1 +} +[[ $abort_test -ne 0 ]] || [[ $smoke_test -ne 0 ]] && { + wt_binary=$(find_file "wt") + [[ ! -x "$wt_binary" ]] && { + echo "$name: wt program \"$wt_binary\" not found" + exit 1 + } +} +config=$(find_file "$config") +[[ -f "$config" ]] || { + echo "$name: configuration file \"$config\" not found" + exit 1 +} +[[ -d "$home" ]] || { + echo "$name: directory \"$home\" not found" + exit 1 +} + +verbose "$name configuration: $format_binary [-c $config]\ +[-h $home] [-j $parallel_jobs] [-n $total_jobs] [-t $minutes] $format_args" + +failure=0 +success=0 +running=0 +status="format.sh-status" + +# Report a failure. +# $1 directory name +report_failure() +{ + dir=$1 + log="$dir.log" + + echo "$name: failure status reported" > $dir/$status + failure=$(($failure + 1)) + + # Forcibly quit if first-failure configured. + [[ $first_failure -ne 0 ]] && force_quit=1 + + echo "$name: job in $dir failed" + echo "$name: $dir log:" + sed 's/^/ > /' < $log +} + +# Resolve/cleanup completed jobs. +resolve() +{ + running=0 + list=$(ls $home | grep '^RUNDIR.[0-9]*$') + for i in $list; do + dir="$home/$i" + log="$dir.log" + + # Skip directories that aren't ours. + [[ ! -f "$log" ]] && continue + + # Skip failures we've already reported. + [[ -f "$dir/$status" ]] && continue + + # Get the process ID, ignore any jobs that aren't yet running. + pid=`grep -E 'process.*running' $log | awk '{print $3}'` + [[ "$pid" =~ ^[1-9][0-9]*$ ]] || continue + + # Leave any process waiting for a gdb attach running, but report it as a failure. + grep -E 'waiting for debugger' $log > /dev/null && { + report_failure $dir + continue + } + + # If the job is still running, ignore it unless we're forcibly quitting. + kill -s 0 $pid > /dev/null 2>&1 && { + [[ $force_quit -eq 0 ]] && { + running=$((running + 1)) + continue + } + kill -s TERM $pid + } + + # Wait for the job and get an exit status. + wait $pid + eret=$? + + # Remove successful jobs. + grep 'successful run completed' $log > /dev/null && { + rm -rf $dir $log + success=$(($success + 1)) + verbose "$name: job in $dir successfully completed" + continue + } + + # Remove jobs we killed. + grep 'caught signal' $log > /dev/null && { + rm -rf $dir $log + verbose "$name: job in $dir signalled" + continue + } + + # Test recovery on jobs configured for random abort. */ + grep 'aborting to test recovery' $log > /dev/null && { + cp -pr $dir $dir.RECOVER + + (echo + echo "$name: running recovery after abort test" + echo "$name: original directory copied into $dir.RECOVER" + echo) >> $log + + # Everything is a table unless explicitly a file. + uri="table:wt" + grep 'data_source=file' $dir/CONFIG > /dev/null && uri="file:wt" + + # Use the wt utility to recover & verify the object. + if $($wt_binary -R -h $dir verify $uri >> $log 2>&1); then + rm -rf $dir $dir.RECOVER $log + success=$(($success + 1)) + verbose "$name: job in $dir successfully completed" + else + echo "$name: job in $dir failed abort/recovery testing" + report_failure $dir + fi + continue + } + + # Check for the library abort message, or an error from format. + grep -E 'aborting WiredTiger library|run FAILED' $log > /dev/null && { + report_failure $dir + continue + } + + # There's some chance we just dropped core. We have the exit status of the process, + # but there's no way to be sure. There are reasons the process' exit status looks + # like a core dump was created (format deliberately causes a segfault in the case + # of abort/recovery testing, and does work that can often segfault in the case of a + # snapshot-isolation mismatch failure), but those cases have already been handled, + # format is responsible for logging a failure before the core can happen. If the + # process exited with a likely failure, call it a failure. + signame="" + case $eret in + $((128 + 3))) + signame="SIGQUIT";; + $((128 + 4))) + signame="SIGILL";; + $((128 + 6))) + signame="SIGABRT";; + $((128 + 7))) + signame="SIGBUS";; + $((128 + 8))) + signame="SIGFPE";; + $((128 + 11))) + signame="SIGSEGV";; + $((128 + 24))) + signame="SIGXCPU";; + $((128 + 25))) + signame="SIGXFSZ";; + $((128 + 31))) + signame="SIGSYS";; + esac + [[ ! -z $signame ]] && { + (echo + echo "$name: job in $dir killed with signal $signame" + echo "$name: there may be a core dump associated with this failure" + echo) >> $log + + echo "$name: job in $dir killed with signal $signame" + echo "$name: there may be a core dump associated with this failure" + + report_failure $dir + continue + } + + done + return 0 +} + +# Start a single job. +count_jobs=0 +format() +{ + count_jobs=$(($count_jobs + 1)) + dir="$home/RUNDIR.$count_jobs" + log="$dir.log" + + if [[ $smoke_test -ne 0 ]]; then + args=${smoke_list[$smoke_next]} + smoke_next=$(($smoke_next + 1)) + echo "$name: starting smoke-test job in $dir" + else + args=$format_args + + # If abort/recovery testing is configured, do it 5% of the time. + [[ $abort_test -ne 0 ]] && [[ $(($count_jobs % 20)) -eq 0 ]] && args="$args abort=1" + + echo "$name: starting job in $dir" + fi + + cmd="$format_binary -c "$config" -h "$dir" -1 $args quiet=1" + verbose "$name: $cmd" + + # Disassociate the command from the shell script so we can exit and let the command + # continue to run. + nohup $cmd > $log 2>&1 & +} + +seconds=$((minutes * 60)) +start_time="$(date -u +%s)" +while :; do + # Check if our time has expired. + [[ $seconds -ne 0 ]] && { + now="$(date -u +%s)" + elapsed=$(($now - $start_time)) + + # If we've run out of time, terminate all running jobs. + [[ $elapsed -ge $seconds ]] && { + verbose "$name: run timed out at $(date)" + force_quit=1 + } + } + + # Start more jobs. + while :; do + # Check if we're only running the smoke-tests and we're done. + [[ $smoke_test -ne 0 ]] && [[ $smoke_next -ge ${#smoke_list[@]} ]] && quit=1 + + # Check if the total number of jobs has been reached. + [[ $total_jobs -ne 0 ]] && [[ $count_jobs -ge $total_jobs ]] && quit=1 + + # Check if less than 60 seconds left on any timer. The goal is to avoid killing + # jobs that haven't yet configured signal handlers, because we rely on handler + # output to determine their final status. + [[ $seconds -ne 0 ]] && [[ $(($seconds - $elapsed)) -lt 60 ]] && quit=1 + + # Don't create more jobs if we're quitting for any reason. + [[ $force_quit -ne 0 ]] || [[ $quit -ne 0 ]] && break; + + # Check if the maximum number of jobs in parallel has been reached. + [[ $running -ge $parallel_jobs ]] && break + running=$(($running + 1)) + + # Start another job, but don't pound on the system. + format + sleep 2 + done + + # Clean up and update status. + success_save=$success + failure_save=$failure + resolve + [[ $success -ne $success_save ]] || [[ $failure -ne $failure_save ]] && + echo "$name: $success successful jobs, $failure failed jobs" + + # Quit if we're done and there aren't any jobs left to wait for. + [[ $quit -ne 0 ]] || [[ $force_quit -ne 0 ]] && [[ $running -eq 0 ]] && break + + # Wait for awhile, unless there are jobs to start. + [[ $running -ge $parallel_jobs ]] && sleep 10 +done + +echo "$name: $success successful jobs, $failure failed jobs" + +verbose "$name: run ending at $(date)" +[[ $failure -ne 0 ]] && exit 1 +exit 0 diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index f136372260c..d74e5cda0c0 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -60,6 +60,10 @@ modify_repl_init(void) modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26]; } +/* + * set_alarm -- + * Set a timer. + */ static void set_alarm(void) { @@ -75,6 +79,41 @@ set_alarm(void) #endif } +/* + * set_core_off -- + * Turn off core dumps. + */ +void +set_core_off(void) +{ +#ifdef HAVE_SETRLIMIT + struct rlimit rlim; + + rlim.rlim_cur = rlim.rlim_max = 0; + testutil_check(setrlimit(RLIMIT_CORE, &rlim)); +#endif +} + +/* + * random_failure -- + * Fail the process. + */ +static void +random_failure(void) +{ + static char *core = NULL; + + /* Let our caller know. */ + printf("%s: aborting to test recovery\n", progname); + fflush(stdout); + + /* Turn off core dumps. */ + set_core_off(); + + /* Fail at a random moment. */ + *core = 0; +} + TINFO **tinfo_list; /* @@ -222,10 +261,8 @@ wts_ops(bool lastrun) /* * On the last execution, optionally drop core for recovery testing. */ - if (lastrun && g.c_abort) { - static char *core = NULL; - *core = 0; - } + if (lastrun && g.c_abort) + random_failure(); tinfo->quit = true; } } diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index eed296e212f..15df14b71dc 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -229,14 +229,16 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) #ifdef HAVE_DIAGNOSTIC /* * We have a mismatch. Try to print out as much information as we can. In doing so, we are - * calling into the debug code directly and that does not take locks. So it is possible that the - * calls may crash in some way. - * - * The most important information is the key/value mismatch information. Then try to dump out - * the other information. Right now we dump the entire lookaside table including what is on - * disk. That can potentially be very large. If it becomes a problem, this can be modified to - * just dump out the page this key is on. + * calling into the debug code directly and that does not take locks, so it's possible we will + * simply drop core. The most important information is the key/value mismatch information. Then + * try to dump out the other information. Right now we dump the entire lookaside table including + * what is on disk. That can potentially be very large. If it becomes a problem, this can be + * modified to just dump out the page this key is on. Write a failure message into the log file + * first so format.sh knows we failed, and turn off core dumps. */ + fprintf(stderr, "\n%s: run FAILED\n", progname); + set_core_off(); + fprintf(stderr, "snapshot-isolation error: Dumping page to %s\n", g.home_pagedump); testutil_check(__wt_debug_cursor_page(cursor, g.home_pagedump)); fprintf(stderr, "snapshot-isolation error: Dumping LAS to %s\n", g.home_lasdump); @@ -244,16 +246,8 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap) if (g.logging) testutil_check(cursor->session->log_flush(cursor->session, "sync=off")); #endif - switch (g.type) { - case FIX: - case VAR: - testutil_die(ret, "snapshot-isolation: %" PRIu64 " search mismatch", keyno); - /* NOTREACHED */ - case ROW: - testutil_die( - ret, "snapshot-isolation: %.*s search mismatch", (int)key->size, (char *)key->data); - /* NOTREACHED */ - } + + testutil_assert(0); /* NOTREACHED */ return (1); diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 7a43ca9f9b4..7ddfe37191c 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -38,15 +38,29 @@ extern int __wt_optind; extern char *__wt_optarg; /* + * signal_timer -- + * Alarm signal handler, report the signal and drop core. + */ +static void signal_timer(int signo) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void +signal_timer(int signo) +{ + fprintf(stderr, "format caught signal %d, aborting the process\n", signo); + fflush(stderr); + __wt_abort(NULL); +} + +/* * signal_handler -- - * Handle signals. + * Generic signal handler, report the signal and exit. */ static void signal_handler(int signo) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); static void signal_handler(int signo) { - fprintf(stderr, "format caught signal %d, aborting the process\n", signo); - __wt_abort(NULL); + fprintf(stderr, "format caught signal %d, exiting\n", signo); + fflush(stderr); + exit(0); } int @@ -64,9 +78,10 @@ main(int argc, char *argv[]) /* * Windows and Linux support different sets of signals, be conservative about installing handlers. + * If we time out, we want a core dump, otherwise, just exit. */ #ifdef SIGALRM - (void)signal(SIGALRM, signal_handler); + (void)signal(SIGALRM, signal_timer); #endif #ifdef SIGHUP (void)signal(SIGHUP, signal_handler); @@ -179,7 +194,8 @@ main(int argc, char *argv[]) testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); testutil_check(pthread_rwlock_init(&g.ts_lock, NULL)); - printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid()); + printf("%s: process %" PRIdMAX " running\n", progname, (intmax_t)getpid()); + fflush(stdout); while (++g.run_cnt <= g.c_runs || g.c_runs == 0) { startup(); /* Start a run */ @@ -260,6 +276,8 @@ main(int argc, char *argv[]) config_clear(); + printf("%s: successful run completed\n", progname); + return (EXIT_SUCCESS); } @@ -314,7 +332,7 @@ format_die(void) fclose_and_clear(&g.logfp); fclose_and_clear(&g.randfp); - fprintf(stderr, "\n"); + fprintf(stderr, "\n%s: run FAILED\n", progname); /* Display the configuration that failed. */ if (g.run_cnt) |