Import wiredtiger: d47dcd1f0ea992775be3d60456593c575451c435 from branch mongodb-4.4

ref: 58115abb6f..d47dcd1f0e
for: 4.3.3

WT-4996 Migrate Jenkins “wiredtiger-test-check-long” job to Evergreen
WT-5082 Application threads are tasked with eviction even when pinning the oldest transaction ID
WT-5232 Create a wrapper script to support format stress tests in Evergreen
WT-5265 Remove pip install gcovr from coverage-report test
WT-5274 format.sh must handle core-dump signals and "gdb attach" build mode
author: Luke Chen <luke.chen@mongodb.com> 2019-12-03 03:01:41 +0000
committer: evergreen <evergreen@mongodb.com> 2019-12-03 03:01:41 +0000
commit: 587f15f0f823924c852b261497110e4b78dca7fe (patch)
tree: d2e92233b4d39b061729597b938c42b67502eaa7 /src/third_party
parent: 2e948c4e94b17089ab56a5437447f9988c31103d (diff)
download: mongo-587f15f0f823924c852b261497110e4b78dca7fe.tar.gz
17 files changed, 713 insertions, 113 deletions
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh
index db92cb95931..db92cb95931 100644..100755
--- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_track.sh
diff --git a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh
index 398c6a9bcf5..398c6a9bcf5 100644..100755
--- a/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh
+++ b/src/third_party/wiredtiger/bench/wtperf/runners/wtperf_xray.sh
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in
index c50d86678e6..30cdd4e5d06 100644
--- a/src/third_party/wiredtiger/build_posix/configure.ac.in
+++ b/src/third_party/wiredtiger/build_posix/configure.ac.in
@@ -172,8 +172,8 @@ AC_CHECK_LIB(dl, dlopen)
 AC_CHECK_LIB(rt, sched_yield)
 
 AC_CHECK_FUNCS([\
-	clock_gettime fallocate ftruncate gettimeofday posix_fadvise\
-	posix_fallocate posix_madvise strtouq sync_file_range timer_create])
+	clock_gettime fallocate ftruncate gettimeofday posix_fadvise posix_fallocate\
+	posix_madvise setrlimit strtouq sync_file_range timer_create])
 
 # OS X wrongly reports that it has fdatasync
 AS_CASE([$host_os], [darwin*], [], [AC_CHECK_FUNCS([fdatasync])])
diff --git a/src/third_party/wiredtiger/build_win/wiredtiger_config.h b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
index 7b2d3fd63bf..c5c0dfda580 100644
--- a/src/third_party/wiredtiger/build_win/wiredtiger_config.h
+++ b/src/third_party/wiredtiger/build_win/wiredtiger_config.h
@@ -79,6 +79,9 @@
 /* Define to 1 if pthread condition variables support monotonic clocks. */
 /* #undef HAVE_PTHREAD_COND_MONOTONIC */
 
+/* Define to 1 if you have the `setrlimit' function. */
+/* #undef HAVE_SETRLIMIT */
+
 /* Define to 1 if you have the `posix_fadvise' function. */
 /* #undef HAVE_POSIX_FADVISE */
 
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 362efcebaff..10065020dd8 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,5 +1,5 @@
 {
-    "commit": "58115abb6fbb3c1cc7bfd087d41a47347bce9a69", 
+    "commit": "d47dcd1f0ea992775be3d60456593c575451c435", 
     "github": "wiredtiger/wiredtiger.git", 
     "vendor": "wiredtiger", 
     "branch": "mongodb-4.4"
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index 7f729c2e661..0faaacc710c 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -2296,9 +2296,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
          * rolled back. Ignore if in recovery, those transactions can't be rolled back.
          */
         if (!F_ISSET(conn, WT_CONN_RECOVERING) && __wt_cache_stuck(session)) {
-            ret = __wt_txn_is_blocking_old(session);
-            if (ret == 0)
-                ret = __wt_txn_is_blocking_pin(session);
+            ret = __wt_txn_is_blocking(session);
             if (ret == WT_ROLLBACK) {
                 --cache->evict_aggressive_score;
                 WT_STAT_CONN_INCR(session, txn_fail_cache);
diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h
index 36cefa8dc68..533f276b15c 100644
--- a/src/third_party/wiredtiger/src/include/api.h
+++ b/src/third_party/wiredtiger/src/include/api.h
@@ -70,6 +70,7 @@
         if ((ret) != 0 && (ret) != WT_NOTFOUND && (ret) != WT_DUPLICATE_KEY && \
           (ret) != WT_PREPARE_CONFLICT && F_ISSET(&(s)->txn, WT_TXN_RUNNING))  \
             F_SET(&(s)->txn, WT_TXN_ERROR);                                    \
+        __wt_op_timer_stop(s);                                                 \
         /*                                                                     \
          * No code after this line, otherwise error handling                   \
          * won't be correct.                                                   \
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index d42e0d43d9d..2b00f07ae07 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1426,9 +1426,7 @@ extern int __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_txn_init(WT_SESSION_IMPL *session, WT_SESSION_IMPL *session_ret)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_txn_is_blocking_old(WT_SESSION_IMPL *session)
-  WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
-extern int __wt_txn_is_blocking_pin(WT_SESSION_IMPL *session)
+extern int __wt_txn_is_blocking(WT_SESSION_IMPL *session)
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
 extern int __wt_txn_log_commit(WT_SESSION_IMPL *session, const char *cfg[])
   WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@@ -2109,6 +2107,7 @@ static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session);
 static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session);
 static inline void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
 static inline void __wt_op_timer_start(WT_SESSION_IMPL *session);
+static inline void __wt_op_timer_stop(WT_SESSION_IMPL *session);
 static inline void __wt_page_evict_soon(WT_SESSION_IMPL *session, WT_REF *ref);
 static inline void __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page);
 static inline void __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page);
diff --git a/src/third_party/wiredtiger/src/include/time.i b/src/third_party/wiredtiger/src/include/time.i
index 0dd6781216e..208243ef612 100644
--- a/src/third_party/wiredtiger/src/include/time.i
+++ b/src/third_party/wiredtiger/src/include/time.i
@@ -160,7 +160,18 @@ __wt_clock_to_nsec(uint64_t end, uint64_t begin)
 static inline void
 __wt_op_timer_start(WT_SESSION_IMPL *session)
 {
-    session->operation_start_us = session->operation_timeout_us == 0 ? 0 : __wt_clock(session);
+    uint64_t timeout_us;
+
+    /* Timer can be configured per-transaction, and defaults to per-connection. */
+    if ((timeout_us = session->txn.operation_timeout_us) == 0)
+        timeout_us = S2C(session)->operation_timeout_us;
+    if (timeout_us == 0)
+        session->operation_start_us = session->operation_timeout_us = 0;
+    else {
+        session->operation_start_us = __wt_clock(session);
+        session->operation_timeout_us = timeout_us;
+    }
+
 #ifdef HAVE_DIAGNOSTIC
     /*
      * This is called at the beginning of each API call. We need to clear out any old values from
@@ -172,6 +183,16 @@ __wt_op_timer_start(WT_SESSION_IMPL *session)
 }
 
 /*
+ * __wt_op_timer_stop --
+ *     Stop the operations timer by zeroing the session's start time and timeout.
+ */
+static inline void
+__wt_op_timer_stop(WT_SESSION_IMPL *session)
+{
+    session->operation_start_us = session->operation_timeout_us = 0;
+}
+
+/*
  * __wt_op_timer_fired --
  *     Check the operations timers.
  */
@@ -180,8 +201,7 @@ __wt_op_timer_fired(WT_SESSION_IMPL *session)
 {
     uint64_t diff, now;
 
-    /* Check for both a timeout and a start time to avoid any future configuration races. */
-    if (session->operation_timeout_us == 0 || session->operation_start_us == 0)
+    if (session->operation_start_us == 0 || session->operation_timeout_us == 0)
         return (false);
 
     now = __wt_clock(session);
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index bdda7a4eae9..59d201e5110 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -320,6 +320,9 @@ struct __wt_txn {
     WT_ITEM *ckpt_snapshot;
     bool full_ckpt;
 
+    /* Timeout */
+    uint64_t operation_timeout_us;
+
     const char *rollback_reason; /* If rollback, the reason */
 
 /*
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index 5d4f4f8495d..8962d268459 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -469,9 +469,7 @@ __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
 
     /* Retrieve the maximum operation time, defaulting to the database-wide configuration. */
     WT_RET(__wt_config_gets(session, cfg, "operation_timeout_ms", &cval));
-    session->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);
-    if (session->operation_timeout_us == 0)
-        session->operation_timeout_us = S2C(session)->operation_timeout_us;
+    txn->operation_timeout_us = (uint64_t)(cval.val * WT_THOUSAND);
 
     /*
      * The default sync setting is inherited from the connection, but can be overridden by an
@@ -621,7 +619,7 @@ __wt_txn_release(WT_SESSION_IMPL *session)
     txn->prepare_timestamp = WT_TS_NONE;
 
     /* Clear operation timer. */
-    session->operation_timeout_us = 0;
+    txn->operation_timeout_us = 0;
 }
 
 /*
@@ -1589,90 +1587,43 @@ __wt_txn_global_shutdown(WT_SESSION_IMPL *session, const char *config, const cha
 }
 
 /*
- * __wt_txn_is_blocking_old --
- *     Return if this transaction is the oldest transaction in the system, called by eviction to
- *     determine if a worker thread should be released from eviction.
+ * __wt_txn_is_blocking --
+ *     Return if this transaction is likely blocking eviction because of a pinned transaction ID,
+ *     called by eviction to determine if a worker thread should be released from eviction.
  */
 int
-__wt_txn_is_blocking_old(WT_SESSION_IMPL *session)
+__wt_txn_is_blocking(WT_SESSION_IMPL *session)
 {
     WT_CONNECTION_IMPL *conn;
     WT_TXN *txn;
-    WT_TXN_GLOBAL *txn_global;
-    WT_TXN_STATE *state;
-    uint64_t id;
-    uint32_t i, session_cnt;
+    uint64_t txn_oldest;
 
     conn = S2C(session);
     txn = &session->txn;
-    txn_global = &conn->txn_global;
 
-    if (txn->id == WT_TXN_NONE || F_ISSET(txn, WT_TXN_PREPARE))
+    /* We can't roll back prepared transactions. */
+    if (F_ISSET(txn, WT_TXN_PREPARE))
         return (false);
 
-    WT_ORDERED_READ(session_cnt, conn->session_cnt);
-
     /*
-     * Check if the transaction is oldest one in the system. It's safe to ignore sessions allocating
-     * transaction IDs, since we already have an ID, they are guaranteed to be newer.
+     * Check the oldest transaction ID of either the current transaction ID or the snapshot. Using
+     * the snapshot potentially means rolling back a read-only transaction, which MongoDB can't
+     * (yet) handle. For this reason, don't use the snapshot unless there's also a transaction ID
+     * or we're configured to time out thread operations (a way to confirm our caller is prepared
+     * for rollback).
      */
-    for (i = 0, state = txn_global->states; i < session_cnt; i++, state++) {
-        if (state->is_allocating)
-            continue;
-
-        WT_ORDERED_READ(id, state->id);
-        if (id != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
-            break;
-    }
-    return (i == session_cnt ?
-        __wt_txn_rollback_required(session, "oldest transaction ID rolled back for eviction") :
+    txn_oldest = txn->id;
+    if (F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && txn->snap_min != WT_TXN_NONE &&
+      (txn_oldest != WT_TXN_NONE || __wt_op_timer_fired(session)) &&
+      (txn_oldest == WT_TXN_NONE || WT_TXNID_LT(txn->snap_min, txn_oldest)))
+        txn_oldest = txn->snap_min;
+    return (txn_oldest == conn->txn_global.oldest_id ?
+        __wt_txn_rollback_required(
+          session, "oldest pinned transaction ID rolled back for eviction") :
         0);
 }
 
 /*
- * __wt_txn_is_blocking_pin --
- *     Return if this transaction is likely blocking eviction because of a pinned transaction ID,
- *     called by eviction to determine if a worker thread should be released from eviction.
- */
-int
-__wt_txn_is_blocking_pin(WT_SESSION_IMPL *session)
-{
-    WT_CONNECTION_IMPL *conn;
-    WT_SESSION_IMPL *s;
-    WT_TXN *txn;
-    uint64_t snap_min;
-    uint32_t i, session_cnt;
-
-    conn = S2C(session);
-    txn = &session->txn;
-
-    /*
-     * Check if we hold the oldest pinned transaction ID in the system. This potentially means
-     * rolling back a read-only transaction, which MongoDB can't (yet) handle. For this reason,
-     * don't check unless we're configured to time out thread operations, a way to confirm our
-     * caller is prepared for rollback.
-     */
-    if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) || txn->snap_min == WT_TXN_NONE)
-        return (0);
-    if (!__wt_op_timer_fired(session))
-        return (0);
-
-    WT_ORDERED_READ(session_cnt, conn->session_cnt);
-
-    for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
-        if (F_ISSET(s, WT_SESSION_INTERNAL) || !F_ISSET(&s->txn, WT_TXN_HAS_SNAPSHOT))
-            continue;
-
-        WT_ORDERED_READ(snap_min, s->txn.snap_min);
-        if (snap_min != WT_TXN_NONE && snap_min < txn->snap_min)
-            break;
-    }
-    return (i == session_cnt ? __wt_txn_rollback_required(
-                                 session, "oldest pinned transaction ID rolled back for eviction") :
-                               0);
-}
-
-/*
  * __wt_verbose_dump_txn_one --
  *     Output diagnostic information about a transaction structure.
  */
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index b4677e3293d..e28772c915b 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -134,6 +134,30 @@ functions:
         for i in $(seq ${times|1}); do
           ./t -1 -c ${config|../../../test/format/CONFIG.stress} ${extra_args|}
         done
+  "many dbs test": # Run ./t in test/manydbs; extra flags come from many_db_args.
+    command: shell.exec
+    params:
+      working_dir: "wiredtiger/build_posix/test/manydbs"
+      script: |
+        set -o errexit
+        set -o verbose
+        ${test_env_vars|} ./t ${many_db_args|}
+  "thread test": # Run ./t in test/thread; extra flags come from thread_test_args.
+    command: shell.exec
+    params:
+      working_dir: "wiredtiger/build_posix/test/thread"
+      script: |
+        set -o errexit
+        set -o verbose
+        ${test_env_vars|} ./t ${thread_test_args|}
+  "random abort test": # Run test_random_abort in test/csuite; flags come from random_abort_args.
+    command: shell.exec
+    params:
+      working_dir: "wiredtiger/build_posix/test/csuite"
+      script: |
+        set -o errexit
+        set -o verbose
+        ${test_env_vars|} ./test_random_abort ${random_abort_args|}
   "upload artifact":
     - command: archive.targz_pack
       params:
@@ -1579,9 +1603,8 @@ tasks:
           script: |
             set -o errexit
             set -o verbose
-            # FIX ME Remove once BUILD-5025 is done
-            pip install gcovr --user
-            GCOV=/opt/mongodbtoolchain/v3/bin/gcov /home/ubuntu/.local/bin/gcovr -r .. -e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html
+
+            GCOV=/opt/mongodbtoolchain/v3/bin/gcov gcovr -r .. -e '.*/bt_(debug|dump|misc|salvage|vrfy).*' -e '.*/(log|progress|verify_build|strerror|env_msg|err_file|cur_config|os_abort)\..*' -e '.*_stat\..*' --html -o ../coverage_report.html
       - command: s3.put
         params:
           aws_secret: ${aws_secret}
@@ -1638,6 +1661,111 @@ tasks:
               cp -rf WT_TEST WT_TEST_$file
             done
 
+  - name: ftruncate-test
+    commands: 
+      - func: "get project"
+      - func: "compile wiredtiger"
+        vars:
+          posix_configure_flags: ac_cv_func_ftruncate=no
+      - command: shell.exec
+        params:
+          working_dir: "wiredtiger/build_posix"
+          script: |
+            set -o errexit
+            set -o verbose
+            ${test_env_vars|} $(pwd)/../test/csuite/random_abort/smoke.sh 2>&1
+            ${test_env_vars|} $(pwd)/../test/csuite/timestamp_abort/smoke.sh 2>&1
+            ${test_env_vars|} $(pwd)/test/csuite/test_truncated_log 2>&1
+
+  - name: long-test
+    commands:
+      - func: "get project"
+      - func: "configure wiredtiger"
+        vars:
+          configure_env_vars: CC=/opt/mongodbtoolchain/v3/bin/gcc CXX=/opt/mongodbtoolchain/v3/bin/g++ PATH=/opt/mongodbtoolchain/v3/bin:$PATH CFLAGS="-g -Werror"
+          posix_configure_flags: --enable-silent-rules --enable-diagnostic --disable-static
+      - func: "make wiredtiger"
+      
+      # Run the long version of make check, that includes the full csuite tests
+      - func: "make check all"
+        vars:
+          test_env_vars: ${test_env_vars} TESTUTIL_ENABLE_LONG_TESTS=1
+      - command: shell.exec
+        params:
+          working_dir: "wiredtiger/build_posix"
+          script: |
+            set -o errexit
+            set -o verbose
+
+            WT3363_CHECKPOINT_OP_RACES=1 test/csuite/./test_wt3363_checkpoint_op_races 2>&1
+      
+      # Many dbs test - Run with:
+      # 1.  The defaults
+      - func: "many dbs test"
+      # 2.  Set idle flag to turn off operations.
+      - func: "many dbs test"
+        vars:
+          many_db_args: -I
+      # 3.  More dbs.
+      - func: "many dbs test"
+        vars:
+          many_db_args: -D 40
+      # 4.  With idle flag and more dbs.
+      - func: "many dbs test"
+        vars:
+          many_db_args: -I -D 40
+      
+      # extended test/thread runs
+      - func: "thread test"
+        vars: 
+          thread_test_args: -t f
+      - func: "thread test"
+        vars: 
+          thread_test_args: -S -F -n 100000 -t f
+      - func: "thread test"
+        vars: 
+          thread_test_args: -t r
+      - func: "thread test"
+        vars: 
+          thread_test_args: -S -F -n 100000 -t r
+      - func: "thread test"
+        vars: 
+          thread_test_args: -t v
+      - func: "thread test"
+        vars:
+          thread_test_args: -S -F -n 100000 -t v
+      
+      # random-abort - default (random time and number of threads)
+      - func: "random abort test"
+      # random-abort - minimum time, random number of threads
+      - func: "random abort test"
+        vars:
+          random_abort_args: -t 10
+      # random-abort - maximum time, random number of threads
+      - func: "random abort test"
+        vars:
+          random_abort_args: -t 40
+      
+      # truncated-log
+      - command: shell.exec
+        params:
+          working_dir: "wiredtiger/build_posix/test/csuite/"
+          script: |
+            set -o errexit
+            set -o verbose
+
+            ./test_truncated_log
+      
+      # format test
+      - func: "test format"
+        vars:
+          extra_args: file_type=fix
+      - func: "test format"
+        vars:
+          extra_args: file_type=row
+      
+      # FIXME: Add wtperf testing from Jenkins "wiredtiger-test-check-long" after fixing WT-5270
+
   - name: time-shift-sensitivity-test
     depends_on:
       - name: compile
@@ -1683,6 +1811,8 @@ buildvariants:
     - name: spinlock-pthread-adaptive-test
     - name: compile-wtperf
     - name: wtperf-test
+    - name: ftruncate-test
+    - name: long-test
 
 - name: ubuntu1804-python3
   display_name: Ubuntu 18.04 (Python3)
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index bae89f7e2f6..66c770cc809 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -28,6 +28,9 @@
 
 #include "test_util.h"
 
+#ifdef HAVE_SETRLIMIT
+#include <sys/resource.h>
+#endif
 #include <signal.h>
 
 #define EXTPATH "../../ext/" /* Extensions path */
@@ -349,6 +352,7 @@ WT_THREAD_RET random_kv(void *);
 void path_setup(const char *);
 int read_row_worker(WT_CURSOR *, uint64_t, WT_ITEM *, WT_ITEM *, bool);
 uint32_t rng(WT_RAND_STATE *);
+void set_core_off(void);
 void snap_init(TINFO *, uint64_t, bool);
 void snap_repeat_single(WT_CURSOR *, TINFO *);
 int snap_repeat_txn(WT_CURSOR *, TINFO *);
diff --git a/src/third_party/wiredtiger/test/format/format.sh b/src/third_party/wiredtiger/test/format/format.sh
new file mode 100755
index 00000000000..722df756afe
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/format.sh
@@ -0,0 +1,442 @@
+#! /bin/bash
+
+[ -z $BASH_VERSION ] && {
+	echo "$0 is a bash script: \$BASH_VERSION not set, exiting"
+	exit 1
+}
+
+name=$(basename $0)
+
+quit=0
+force_quit=0
+onintr()
+{
+	echo "$name: interrupted, cleaning up..."
+	force_quit=1
+}
+trap 'onintr' 2
+
+usage() {
+	echo "usage: $0 [-aFSv] [-c config] "
+	echo "    [-h home] [-j parallel-jobs] [-n total-jobs] [-t minutes] [format-configuration]"
+	echo
+	echo "    -a           abort/recovery testing (defaults to off)"
+	echo "    -c config    format configuration file (defaults to CONFIG.stress)"
+	echo "    -F           quit on first failure (defaults to off)"
+	echo "    -h home      run directory (defaults to .)"
+	echo "    -j parallel  jobs to execute in parallel (defaults to 8)"
+	echo "    -n total     total jobs to execute (defaults to no limit)"
+	echo "    -S           run smoke-test configurations (defaults to off)"
+	echo "    -t minutes   minutes to run (defaults to no limit)"
+	echo "    -v           verbose output (defaults to off)"
+	echo "    --           separates $name arguments from format arguments"
+
+	exit 1
+}
+
+# Smoke-tests.
+smoke_base_1="data_source=table rows=100000 threads=6 timer=4"
+smoke_base_2="$smoke_base_1 leaf_page_max=9 internal_page_max=9"
+smoke_list=(
+	# Three access methods.
+	"$smoke_base_1 file_type=fix"
+	"$smoke_base_1 file_type=row"
+	"$smoke_base_1 file_type=var"
+
+	# Huffman key/value encoding.
+	"$smoke_base_1 file_type=row huffman_key=1 huffman_value=1"
+	"$smoke_base_1 file_type=var huffman_key=1 huffman_value=1"
+
+	# Abort/recovery test.
+	"$smoke_base_1 file_type=row abort=1"
+
+	# LSM
+	"$smoke_base_1 file_type=row data_source=lsm"
+
+	# Force tree rebalance and the statistics server.
+	"$smoke_base_1 file_type=row statistics_server=1 rebalance=1"
+
+	# Overflow testing.
+	"$smoke_base_2 file_type=var value_min=256"
+	"$smoke_base_2 file_type=row key_min=256"
+	"$smoke_base_2 file_type=row key_min=256 value_min=256"
+)
+smoke_next=0
+
+abort_test=0
+build=""
+config="CONFIG.stress"
+first_failure=0
+format_args=""
+home="."
+minutes=0
+parallel_jobs=8
+smoke_test=0
+total_jobs=0
+verbose=0
+
+while :; do	# Parse script options; stop at "--" or the first non-option argument.
+	case "$1" in
+	-a)
+		abort_test=1
+		shift ;;
+	-c)
+		config="$2"
+		shift ; shift ;;
+	-F)
+		first_failure=1
+		shift ;;
+	-h)
+		home="$2"
+		shift ; shift ;;
+	-j)
+		parallel_jobs="$2"
+		[[ "$parallel_jobs" =~ ^[1-9][0-9]*$ ]] || {
+			echo "$name: -j option argument must be a non-zero integer"
+			exit 1
+		}
+		shift ; shift ;;
+	-n)
+		total_jobs="$2"
+		[[ "$total_jobs" =~ ^[1-9][0-9]*$ ]] || {
+			echo "$name: -n option argument must be a non-zero integer"
+			exit 1
+		}
+		shift ; shift ;;
+	-S)
+		smoke_test=1
+		shift ;;
+	-t)
+		minutes="$2"
+		[[ "$minutes" =~ ^[1-9][0-9]*$ ]] || {
+			echo "$name: -t option argument must be a non-zero integer"
+			exit 1
+		}
+		shift ; shift ;;
+	-v)
+		verbose=1
+		shift ;;
+	--)
+		shift; break;;
+	-*)
+		usage ;;
+	*)
+		break ;;
+	esac
+done
+format_args="$*"
+
+verbose()
+{
+	[[ $verbose -ne 0 ]] && echo "$@"
+}
+
+verbose "$name: run starting at $(date)"
+
+# Find a component we need.
+# $1 name to find
+find_file()
+{
+	# Get the directory path to format.sh, which is always in wiredtiger/test/format, then
+	# use that as the base for all the other places we check.
+	d=$(dirname $0)
+
+	# Check wiredtiger/test/format/, likely location of the format binary and the CONFIG file.
+	f="$d/$1"
+	if [[ -f "$f" ]]; then
+		echo "$f"
+		return
+	fi
+
+	# Check wiredtiger/build_posix/test/format/, likely location of the format binary and the
+	# CONFIG file.
+	f="$d/../../build_posix/test/format/$1"
+	if [[ -f "$f" ]]; then
+		echo "$f"
+		return
+	fi
+
+	# Check wiredtiger/, likely location of the wt binary.
+	f="$d/../../$1"
+	if [[ -f "$f" ]]; then
+		echo "$f"
+		return
+	fi
+
+	# Check wiredtiger/build_posix/, likely location of the wt binary.
+	f="$d/../../build_posix/$1"
+	if [[ -f "$f" ]]; then
+		echo "$f"
+		return
+	fi
+
+	echo "./$1"
+}
+
+# Find the format and wt binaries (the latter is only required for abort/recovery testing),
+# the configuration file and the run directory.
+format_binary=$(find_file "t")
+[[ ! -x "$format_binary" ]] && {
+	echo "$name: format program \"$format_binary\" not found"
+	exit 1
+}
+[[ $abort_test -ne 0 ]] || [[ $smoke_test -ne 0 ]] && {
+	wt_binary=$(find_file "wt")
+	[[ ! -x "$wt_binary" ]] && {
+		echo "$name: wt program \"$wt_binary\" not found"
+		exit 1
+	}
+}
+config=$(find_file "$config")
+[[ -f "$config" ]] || {
+	echo "$name: configuration file \"$config\" not found"
+	exit 1
+}
+[[ -d "$home" ]] || {
+	echo "$name: directory \"$home\" not found"
+	exit 1
+}
+
+verbose "$name configuration: $format_binary [-c $config]\
+[-h $home] [-j $parallel_jobs] [-n $total_jobs] [-t $minutes] $format_args"
+
+failure=0
+success=0
+running=0
+status="format.sh-status"
+
+# Report a failure.
+# $1 directory name
+report_failure()
+{
+	dir=$1
+	log="$dir.log"
+
+	echo "$name: failure status reported" > $dir/$status
+	failure=$(($failure + 1))
+
+	# Forcibly quit if first-failure configured.
+	[[ $first_failure -ne 0 ]] && force_quit=1
+
+	echo "$name: job in $dir failed"
+	echo "$name: $dir log:"
+	sed 's/^/    > /' < $log
+}
+
+# Resolve/cleanup completed jobs: classify each RUNDIR job from its log and exit status.
+resolve()
+{
+	running=0
+	list=$(ls "$home" | grep '^RUNDIR.[0-9]*$')
+	for i in $list; do
+		dir="$home/$i"
+		log="$dir.log"
+
+		# Skip directories that aren't ours.
+		[[ ! -f "$log" ]] && continue
+
+		# Skip failures we've already reported.
+		[[ -f "$dir/$status" ]] && continue
+
+		# Get the process ID, ignore any jobs that aren't yet running.
+		pid=$(grep -E 'process.*running' "$log" | awk '{print $3}')
+		[[ "$pid" =~ ^[1-9][0-9]*$ ]] || continue
+
+		# Leave any process waiting for a gdb attach running, but report it as a failure.
+		grep -E 'waiting for debugger' "$log" > /dev/null && {
+			report_failure "$dir"
+			continue
+		}
+
+		# If the job is still running, ignore it unless we're forcibly quitting.
+		kill -s 0 $pid > /dev/null 2>&1 && {
+			[[ $force_quit -eq 0 ]] && {
+				running=$((running + 1))
+				continue
+			}
+			kill -s TERM $pid
+		}
+
+		# Wait for the job and get an exit status.
+		wait $pid
+		eret=$?
+
+		# Remove successful jobs.
+		grep 'successful run completed' "$log" > /dev/null && {
+			rm -rf "$dir" "$log"
+			success=$(($success + 1))
+			verbose "$name: job in $dir successfully completed"
+			continue
+		}
+
+		# Remove jobs we killed.
+		grep 'caught signal' "$log" > /dev/null && {
+			rm -rf "$dir" "$log"
+			verbose "$name: job in $dir signalled"
+			continue
+		}
+
+		# Test recovery on jobs configured for random abort.
+		grep 'aborting to test recovery' "$log" > /dev/null && {
+			cp -pr "$dir" "$dir.RECOVER"
+
+			(echo
+			 echo "$name: running recovery after abort test"
+			 echo "$name: original directory copied into $dir.RECOVER"
+			 echo) >> "$log"
+
+			# Everything is a table unless explicitly a file.
+			uri="table:wt"
+			grep 'data_source=file' "$dir/CONFIG" > /dev/null && uri="file:wt"
+
+			# Use the wt utility to recover & verify the object; test its exit status directly.
+			if "$wt_binary" -R -h "$dir" verify "$uri" >> "$log" 2>&1; then
+				rm -rf "$dir" "$dir.RECOVER" "$log"
+				success=$(($success + 1))
+				verbose "$name: job in $dir successfully completed"
+			else
+				echo "$name: job in $dir failed abort/recovery testing"
+				report_failure "$dir"
+			fi
+			continue
+		}
+
+		# Check for the library abort message, or an error from format.
+		grep -E 'aborting WiredTiger library|run FAILED' "$log" > /dev/null && {
+			report_failure "$dir"
+			continue
+		}
+
+		# There's some chance we just dropped core. We have the exit status of the process,
+		# but there's no way to be sure. There are reasons the process' exit status looks
+		# like a core dump was created (format deliberately causes a segfault in the case
+		# of abort/recovery testing, and does work that can often segfault in the case of a
+		# snapshot-isolation mismatch failure), but those cases have already been handled,
+		# format is responsible for logging a failure before the core can happen. If the
+		# process exited with a likely failure, call it a failure.
+		signame=""
+		case $eret in
+		$((128 + 3)))
+			signame="SIGQUIT";;
+		$((128 + 4)))
+			signame="SIGILL";;
+		$((128 + 6)))
+			signame="SIGABRT";;
+		$((128 + 7)))
+			signame="SIGBUS";;
+		$((128 + 8)))
+			signame="SIGFPE";;
+		$((128 + 11)))
+			signame="SIGSEGV";;
+		$((128 + 24)))
+			signame="SIGXCPU";;
+		$((128 + 25)))
+			signame="SIGXFSZ";;
+		$((128 + 31)))
+			signame="SIGSYS";;
+		esac
+		[[ ! -z $signame ]] && {
+			(echo
+			 echo "$name: job in $dir killed with signal $signame"
+			 echo "$name: there may be a core dump associated with this failure"
+			 echo) >> "$log"
+
+			echo "$name: job in $dir killed with signal $signame"
+			echo "$name: there may be a core dump associated with this failure"
+
+			report_failure "$dir"
+			continue
+		}
+
+	done
+	return 0
+}
+
+# Start a single job in a new RUNDIR.N directory, logging to RUNDIR.N.log.
+count_jobs=0
+format()
+{
+	count_jobs=$(($count_jobs + 1))
+	dir="$home/RUNDIR.$count_jobs"
+	log="$dir.log"
+
+	if [[ $smoke_test -ne 0 ]]; then
+		args=${smoke_list[$smoke_next]}
+		smoke_next=$(($smoke_next + 1))
+		echo "$name: starting smoke-test job in $dir"
+	else
+		args=$format_args
+
+		# If abort/recovery testing is configured, do it 5% of the time.
+		[[ $abort_test -ne 0 ]] && [[ $(($count_jobs % 20)) -eq 0 ]] && args="$args abort=1"
+
+		echo "$name: starting job in $dir"
+	fi
+
+	cmd=("$format_binary" -c "$config" -h "$dir" -1 $args quiet=1)
+	verbose "$name: ${cmd[*]}"
+
+	# Disassociate the command from the shell script so we can exit and let the command
+	# continue to run. The array keeps paths containing whitespace as single arguments.
+	nohup "${cmd[@]}" > "$log" 2>&1 &
+}
+
+seconds=$((minutes * 60))
+start_time="$(date -u +%s)"
+while :; do
+	# Check if our time has expired.
+	[[ $seconds -ne 0 ]] && {
+		now="$(date -u +%s)"
+		elapsed=$(($now - $start_time))
+
+		# If we've run out of time, terminate all running jobs.
+		[[ $elapsed -ge $seconds ]] && {
+			verbose "$name: run timed out at $(date)"
+			force_quit=1
+		}
+	}
+
+	# Start more jobs.
+	while :; do
+		# Check if we're only running the smoke-tests and we're done.
+		[[ $smoke_test -ne 0 ]] && [[ $smoke_next -ge ${#smoke_list[@]} ]] && quit=1
+	
+		# Check if the total number of jobs has been reached.
+		[[ $total_jobs -ne 0 ]] && [[ $count_jobs -ge $total_jobs ]] && quit=1
+
+		# Check if less than 60 seconds left on any timer. The goal is to avoid killing
+		# jobs that haven't yet configured signal handlers, because we rely on handler
+		# output to determine their final status.
+		[[ $seconds -ne 0 ]] && [[ $(($seconds - $elapsed)) -lt 60 ]] && quit=1
+
+		# Don't create more jobs if we're quitting for any reason.
+		[[ $force_quit -ne 0 ]] || [[ $quit -ne 0 ]] && break;
+
+		# Check if the maximum number of jobs in parallel has been reached.
+		[[ $running -ge $parallel_jobs ]] && break
+		running=$(($running + 1))
+
+		# Start another job, but don't pound on the system.
+		format
+		sleep 2
+	done
+
+	# Clean up and update status.
+	success_save=$success
+	failure_save=$failure
+	resolve
+	[[ $success -ne $success_save ]] || [[ $failure -ne $failure_save ]] &&
+	    echo "$name: $success successful jobs, $failure failed jobs"
+
+	# Quit if we're done and there aren't any jobs left to wait for.
+	[[ $quit -ne 0 ]] || [[ $force_quit -ne 0 ]] && [[ $running -eq 0 ]] && break
+
+	# Wait for awhile, unless there are jobs to start.
+	[[ $running -ge $parallel_jobs ]] && sleep 10
+done
+
+echo "$name: $success successful jobs, $failure failed jobs"
+
+verbose "$name: run ending at $(date)"
+[[ $failure -ne 0 ]] && exit 1
+exit 0
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index f136372260c..d74e5cda0c0 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -60,6 +60,10 @@ modify_repl_init(void)
         modify_repl[i] = "zyxwvutsrqponmlkjihgfedcba"[i % 26];
 }
 
+/*
+ * set_alarm --
+ *     Set a timer.
+ */
 static void
 set_alarm(void)
 {
@@ -75,6 +79,41 @@ set_alarm(void)
 #endif
 }
 
+/*
+ * set_core_off --
+ *     Turn off core dumps by zeroing the RLIMIT_CORE limits (no-op if HAVE_SETRLIMIT is unset).
+ */
+void
+set_core_off(void)
+{
+#ifdef HAVE_SETRLIMIT
+    struct rlimit rlim;
+
+    rlim.rlim_cur = rlim.rlim_max = 0;
+    testutil_check(setrlimit(RLIMIT_CORE, &rlim));
+#endif
+}
+
+/*
+ * random_failure --
+ *     Report the abort on stdout, turn off core dumps, then crash via a NULL store.
+ */
+static void
+random_failure(void)
+{
+    static char *volatile core = NULL;
+
+    /* Let our caller know. */
+    printf("%s: aborting to test recovery\n", progname);
+    fflush(stdout);
+
+    /* Turn off core dumps. */
+    set_core_off();
+
+    /* Crash here; the volatile pointer keeps the compiler from eliding the NULL store. */
+    *core = 0;
+}
+
 TINFO **tinfo_list;
 
 /*
@@ -222,10 +261,8 @@ wts_ops(bool lastrun)
                 /*
                  * On the last execution, optionally drop core for recovery testing.
                  */
-                if (lastrun && g.c_abort) {
-                    static char *core = NULL;
-                    *core = 0;
-                }
+                if (lastrun && g.c_abort)
+                    random_failure();
                 tinfo->quit = true;
             }
         }
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index eed296e212f..15df14b71dc 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -229,14 +229,16 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
 #ifdef HAVE_DIAGNOSTIC
     /*
      * We have a mismatch. Try to print out as much information as we can. In doing so, we are
-     * calling into the debug code directly and that does not take locks. So it is possible that the
-     * calls may crash in some way.
-     *
-     * The most important information is the key/value mismatch information. Then try to dump out
-     * the other information. Right now we dump the entire lookaside table including what is on
-     * disk. That can potentially be very large. If it becomes a problem, this can be modified to
-     * just dump out the page this key is on.
+     * calling into the debug code directly and that does not take locks, so it's possible we will
+     * simply drop core. The most important information is the key/value mismatch information. Then
+     * try to dump out the other information. Right now we dump the entire lookaside table including
+     * what is on disk. That can potentially be very large. If it becomes a problem, this can be
+     * modified to just dump out the page this key is on. Write a failure message into the log file
+     * first so format.sh knows we failed, and turn off core dumps.
      */
+    fprintf(stderr, "\n%s: run FAILED\n", progname);
+    set_core_off();
+
     fprintf(stderr, "snapshot-isolation error: Dumping page to %s\n", g.home_pagedump);
     testutil_check(__wt_debug_cursor_page(cursor, g.home_pagedump));
     fprintf(stderr, "snapshot-isolation error: Dumping LAS to %s\n", g.home_lasdump);
@@ -244,16 +246,8 @@ snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
     if (g.logging)
         testutil_check(cursor->session->log_flush(cursor->session, "sync=off"));
 #endif
-    switch (g.type) {
-    case FIX:
-    case VAR:
-        testutil_die(ret, "snapshot-isolation: %" PRIu64 " search mismatch", keyno);
-    /* NOTREACHED */
-    case ROW:
-        testutil_die(
-          ret, "snapshot-isolation: %.*s search mismatch", (int)key->size, (char *)key->data);
-        /* NOTREACHED */
-    }
+
+    testutil_assert(0);
 
     /* NOTREACHED */
     return (1);
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 7a43ca9f9b4..7ddfe37191c 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -38,15 +38,29 @@ extern int __wt_optind;
 extern char *__wt_optarg;
 
 /*
+ * signal_timer --
+ *     Alarm signal handler: report the signal on stderr, then call __wt_abort to drop core.
+ */
+static void signal_timer(int signo) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
+static void
+signal_timer(int signo)
+{
+    fprintf(stderr, "format caught signal %d, aborting the process\n", signo);
+    fflush(stderr);
+    __wt_abort(NULL);
+}
+
+/*
  * signal_handler --
- *     Handle signals.
+ *     Generic signal handler: report the signal on stderr, flush, and exit with status 0.
  */
 static void signal_handler(int signo) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
 static void
 signal_handler(int signo)
 {
-    fprintf(stderr, "format caught signal %d, aborting the process\n", signo);
-    __wt_abort(NULL);
+    fprintf(stderr, "format caught signal %d, exiting\n", signo);
+    fflush(stderr);
+    exit(0);
 }
 
 int
@@ -64,9 +78,10 @@ main(int argc, char *argv[])
 
 /*
  * Windows and Linux support different sets of signals, be conservative about installing handlers.
+ * If we time out, we want a core dump; otherwise, just exit.
  */
 #ifdef SIGALRM
-    (void)signal(SIGALRM, signal_handler);
+    (void)signal(SIGALRM, signal_timer);
 #endif
 #ifdef SIGHUP
     (void)signal(SIGHUP, signal_handler);
@@ -179,7 +194,8 @@ main(int argc, char *argv[])
     testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
     testutil_check(pthread_rwlock_init(&g.ts_lock, NULL));
 
-    printf("%s: process %" PRIdMAX "\n", progname, (intmax_t)getpid());
+    printf("%s: process %" PRIdMAX " running\n", progname, (intmax_t)getpid());
+    fflush(stdout);
     while (++g.run_cnt <= g.c_runs || g.c_runs == 0) {
         startup(); /* Start a run */
 
@@ -260,6 +276,8 @@ main(int argc, char *argv[])
 
     config_clear();
 
+    printf("%s: successful run completed\n", progname);
+
     return (EXIT_SUCCESS);
 }
 
@@ -314,7 +332,7 @@ format_die(void)
     fclose_and_clear(&g.logfp);
     fclose_and_clear(&g.randfp);
 
-    fprintf(stderr, "\n");
+    fprintf(stderr, "\n%s: run FAILED\n", progname);
 
     /* Display the configuration that failed. */
     if (g.run_cnt)
author	Luke Chen <luke.chen@mongodb.com>	2019-12-03 03:01:41 +0000
committer	evergreen <evergreen@mongodb.com>	2019-12-03 03:01:41 +0000
commit	587f15f0f823924c852b261497110e4b78dca7fe (patch)
tree	d2e92233b4d39b061729597b938c42b67502eaa7 /src/third_party
parent	2e948c4e94b17089ab56a5437447f9988c31103d (diff)
download	mongo-587f15f0f823924c852b261497110e4b78dca7fe.tar.gz