SERVER-51598 Add new abort_multi_stmt_txn_test suites

author: Jason Chan <jason.chan@10gen.com> 2020-11-10 12:07:35 -0500
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2020-11-12 07:02:09 +0000
commit: e399bf8689f592129c9655933bdb6a0e551a47b8 (patch)
tree: 0609ae3f7617076eb13c6d7f38453f44a74cfdac /buildscripts
parent: 0ed9f1bf7d27e43174ce806291a597947b1f98ae (diff)
download: mongo-e399bf8689f592129c9655933bdb6a0e551a47b8.tar.gz
5 files changed, 453 insertions, 0 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_replication_abort_multi_stmt_txn.yml b/buildscripts/resmokeconfig/suites/concurrency_replication_abort_multi_stmt_txn.yml
new file mode 100644
index 00000000000..6b24de0bbf3
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/concurrency_replication_abort_multi_stmt_txn.yml
@@ -0,0 +1,77 @@
+# A test suite intended to reproduce the crash in HELP-19266. This test suite aborts transactions
+# by shortening their expiration time via failpoints. It ignores all assertion and command
+# failure errors and should only fail on a system crash, a hang, or a data consistency
+# error.
+
+test_kind: abort_txns_fsm_workload_test
+
+selector:
+  roots:
+  - jstests/concurrency/fsm_workloads/**/*.js
+  exclude_files:
+  ##
+  # Disabled due to MongoDB restrictions and/or workload restrictions
+  ##
+  # These workloads use >100MB of data, which can overwhelm test hosts.
+  - jstests/concurrency/fsm_workloads/agg_group_external.js
+  - jstests/concurrency/fsm_workloads/agg_sort_external.js
+  # The findAndModify_update_grow.js workload can cause OOM kills on test hosts.
+  - jstests/concurrency/fsm_workloads/findAndModify_update_grow.js
+
+  # Creates a cursor in one state function and uses it in a different state function, which means
+  # that in this suite it attempts to use the same cursor in multiple transactions.
+  - jstests/concurrency/fsm_workloads/invalidated_cursors.js
+  - jstests/concurrency/fsm_workloads/globally_managed_cursors.js
+  - jstests/concurrency/fsm_workloads/kill_multicollection_aggregation.js
+
+  # Relies on having one thread observe writes from the other threads, which won't become visible
+  # once a transaction in the thread is started because it'll keep reading from the same snapshot.
+  - jstests/concurrency/fsm_workloads/create_index_background.js
+
+  exclude_with_any_tags:
+  - requires_sharding
+
+  # Tests which expect commands to fail and catch the error can cause transactions to abort and
+  # retry indefinitely.
+  - catches_command_failures
+
+executor:
+  archive:
+    hooks:
+      - CheckReplDBHashInBackground
+      - CheckReplDBHash
+      - ValidateCollections
+    tests: true
+  config:
+    shell_options:
+      # Ignore assertion failures from the shell in this test suite.
+      eval: doassert = Function.prototype;
+      readMode: commands
+      global_vars:
+        TestData:
+          runInsideTransaction: true
+          runningWithSessions: true
+  hooks:
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  #
+  # We don't run the CheckReplDBHashInBackground hook in this suite as it uses transactions to get
+  # the db hashes and would throw when one is aborted by the shortened transaction lifetime.
+  # NOTE(review): it is still listed under archive.hooks above, which looks stale -- confirm.
+  # TODO SERVER-26466: Add CheckReplOplogs hook to the concurrency suite.
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanupConcurrencyWorkloads
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      oplogSize: 1024
+      set_parameters:
+        enableTestCommands: 1
+        numInitialSyncAttempts: 1
+        failpoint.setTransactionLifetimeToRandomMillis:
+          mode: alwaysOn
+        failpoint.increaseFrequencyOfPeriodicThreadToExpireTransactions:
+          mode: alwaysOn
+    num_nodes: 3
diff --git a/buildscripts/resmokeconfig/suites/replica_sets_abort_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/replica_sets_abort_multi_stmt_txn_jscore_passthrough.yml
new file mode 100644
index 00000000000..edc974bb5a8
--- /dev/null
+++ b/buildscripts/resmokeconfig/suites/replica_sets_abort_multi_stmt_txn_jscore_passthrough.yml
@@ -0,0 +1,320 @@
+# A test suite intended to reproduce the crash in HELP-19266. This test suite aborts transactions
+# by shortening their expiration time via failpoints. It ignores all assertion and command
+# failure errors and should only fail on a system crash, a hang, or a data consistency
+# error.
+
+test_kind: abort_multi_stmt_txn_passthrough
+
+selector:
+  roots:
+  - jstests/core/**/*.js
+  # TODO: SERVER-35089
+  # - jstests/libs/txns/txn_passthrough_runner_selftest.js
+  exclude_files:
+  # TODO: SERVER-35089
+  - jstests/core/geo_allowedcomparisons.js
+  - jstests/core/geo_big_polygon2.js
+  - jstests/core/in.js
+  - jstests/core/orj.js
+  - jstests/core/insert1.js
+
+  # These tests already run with transactions.
+  - jstests/core/txns/*.js
+
+  # These tests are not expected to pass with replica-sets:
+  - jstests/core/capped_update.js
+  - jstests/core/dbadmin.js
+  - jstests/core/opcounters_write_cmd.js
+  - jstests/core/read_after_optime.js
+
+  ##
+  ## Limitations with the way the runner file injects transactions.
+  ##
+  
+  # These tests expect some statements to error, which will cause txns to abort entirely.
+  - jstests/core/bulk_api_ordered.js
+  - jstests/core/bulk_api_unordered.js
+  - jstests/core/bulk_legacy_enforce_gle.js
+  - jstests/core/capped5.js
+  - jstests/core/commands_with_uuid.js
+  - jstests/core/explain_execution_error.js
+  - jstests/core/expr.js
+  - jstests/core/find_and_modify_invalid_query_params.js
+  - jstests/core/find_getmore_bsonsize.js
+  - jstests/core/find_getmore_cmd.js
+  - jstests/core/find9.js
+  - jstests/core/index_big1.js
+  - jstests/core/index_bigkeys.js
+  - jstests/core/index_decimal.js
+  - jstests/core/index_multiple_compatibility.js
+  - jstests/core/index_partial_write_ops.js
+  - jstests/core/index8.js # No explicit check for failed command.
+  - jstests/core/indexa.js # No explicit check for failed command.
+  - jstests/core/indexes_multiple_commands.js
+  - jstests/core/insert_long_index_key.js
+  - jstests/core/js2.js
+  - jstests/core/json_schema/json_schema.js
+  - jstests/core/mr_bigobject.js
+  - jstests/core/not2.js
+  - jstests/core/notablescan.js
+  - jstests/core/or1.js
+  - jstests/core/or2.js
+  - jstests/core/or3.js
+  - jstests/core/ork.js
+  - jstests/core/ref4.js
+  - jstests/core/regex_limit.js
+  - jstests/core/remove_undefined.js
+  - jstests/core/set7.js 
+  - jstests/core/sortb.js
+  - jstests/core/sortf.js
+  - jstests/core/sortg.js
+  - jstests/core/sortj.js
+  - jstests/core/tailable_skip_limit.js
+  - jstests/core/type_array.js
+  - jstests/core/uniqueness.js
+  - jstests/core/unset2.js
+  - jstests/core/update_addToSet.js
+  - jstests/core/update_arrayFilters.js
+  - jstests/core/update_find_and_modify_id.js
+  - jstests/core/update_modifier_pop.js
+  - jstests/core/updateh.js
+  - jstests/core/updatej.js
+  - jstests/core/ref.js
+
+  # Consecutive writes totalling more than 16MB will cause the txn to abort with
+  # a TransactionTooLarge error.
+  - jstests/core/batch_size.js
+  - jstests/core/single_batch.js
+
+  ##
+  ## Some aggregation stages don't support snapshot readconcern.
+  ##
+
+  # $explain (requires read concern local)
+  - jstests/core/agg_hint.js
+  - jstests/core/and.js
+  - jstests/core/collation.js
+  - jstests/core/explain_shell_helpers.js
+  - jstests/core/index_partial_read_ops.js
+  - jstests/core/optimized_match_explain.js
+  - jstests/core/sort_array.js
+  - jstests/core/views/views_collation.js
+
+  # $out
+  - jstests/core/bypass_doc_validation.js
+  - jstests/core/views/views_aggregation.js
+
+  # $listSessions
+  - jstests/core/list_all_local_sessions.js
+  - jstests/core/list_all_sessions.js
+  - jstests/core/list_local_sessions.js
+  - jstests/core/list_sessions.js
+
+  # $indexStats
+  - jstests/core/index_stats.js
+
+  # $collStats
+  - jstests/core/operation_latency_histogram.js
+  - jstests/core/views/views_coll_stats.js
+  - jstests/core/views/views_stats.js
+
+  ##
+  ## WriteErrors get converted to WriteCommandErrors if part of a txn.
+  ##
+
+  # The same error code, but with ok:0.
+  - jstests/core/json_schema/additional_items.js
+  - jstests/core/json_schema/additional_properties.js
+  - jstests/core/json_schema/bsontype.js
+  - jstests/core/json_schema/dependencies.js
+  - jstests/core/json_schema/items.js
+  - jstests/core/json_schema/logical_keywords.js
+  - jstests/core/json_schema/min_max_items.js
+  - jstests/core/json_schema/min_max_properties.js
+  - jstests/core/json_schema/pattern_properties.js
+  - jstests/core/json_schema/required.js
+  - jstests/core/json_schema/unique_items.js
+
+  - jstests/core/field_name_validation.js
+  - jstests/core/fts_array.js
+  - jstests/core/inc-SERVER-7446.js
+  - jstests/core/invalid_db_name.js
+  - jstests/core/push_sort.js
+
+  # Checks for "WriteErrors" explicitly from the response of db.runCommand()
+  - jstests/core/max_doc_size.js
+
+  # Calls res.getWriteError() or res.hasWriteError().
+  - jstests/core/bulk_api_ordered.js
+  - jstests/core/bulk_api_unordered.js
+  - jstests/core/bulk_legacy_enforce_gle.js
+  - jstests/core/cappeda.js
+  - jstests/core/doc_validation.js
+  - jstests/core/doc_validation_options.js
+  - jstests/core/geo_multinest0.js
+  - jstests/core/insert_illegal_doc.js
+  - jstests/core/ns_length.js
+  - jstests/core/push2.js
+  - jstests/core/remove6.js
+  - jstests/core/removeb.js
+  - jstests/core/rename4.js
+  - jstests/core/shell_writeconcern.js
+  - jstests/core/storefunc.js
+  - jstests/core/update_arrayFilters.js
+  - jstests/core/update_dbref.js
+  - jstests/core/updatel.js
+  - jstests/core/write_result.js
+
+  # Multiple writes in a txn, some of which fail because the collection doesn't exist.
+  # We create the collection and retry the last write, but previous writes would have
+  # still failed.
+  - jstests/core/dbref1.js
+  - jstests/core/dbref2.js
+  - jstests/core/ref3.js
+  - jstests/core/repair_database.js
+  - jstests/core/update3.js
+  - jstests/core/rename3.js
+
+  ##
+  ## Error: Unable to acquire lock within a max lock request timeout of '0ms' milliseconds
+  ##
+
+  # Collection drops done through applyOps are not converted to w:majority
+  - jstests/core/views/invalid_system_views.js
+
+  # Operations run on the "out" collection of a MapReduce call, which is not always
+  # immediately available to a transaction as it is still being replicated. Transactions
+  # fail with "Unable to acquire lock" errors.
+  - jstests/core/function_string_representations.js
+  - jstests/core/mr_errorhandling.js
+  - jstests/core/mr_merge.js
+  - jstests/core/mr_merge2.js
+  - jstests/core/mr_replaceIntoDB.js
+  - jstests/core/mr_outreduce.js
+  - jstests/core/mr_outreduce2.js
+
+  ##
+  ## Misc. reasons.
+  ##
+
+  # SERVER-34868 Cannot run a legacy query on a session.
+  - jstests/core/exhaust.js
+  - jstests/core/validate_cmd_ns.js
+
+  # SERVER-34772 Tailable Cursors are not allowed with snapshot readconcern.
+  - jstests/core/awaitdata_getmore_cmd.js
+  - jstests/core/getmore_cmd_maxtimems.js
+  - jstests/core/tailable_cursor_invalidation.js
+  - jstests/core/tailable_getmore_batch_size.js
+
+  # SERVER-34918 The "max" option of a capped collection can be exceeded until the next insert.
+  # The reason is that we don't update the count of a collection until a transaction commits,
+  # by which point it is too late to complain that "max" has been exceeded.
+  - jstests/core/capped_max1.js
+
+  # The "max" option of a capped collection can be temporarily exceeded before a
+  # txn is committed.
+  - jstests/core/bulk_insert_capped.js
+
+  # Wrong count for top info (WriteLock)
+  - jstests/core/top.js
+
+  # Expects collection to not have been created
+  - jstests/core/insert_id_undefined.js
+
+  # Creates sessions explicitly, resulting in txns being run through different sessions
+  # using a single txnNumber.
+  - jstests/core/list_all_local_cursors.js
+  - jstests/core/json_schema/misc_validation.js
+  - jstests/core/views/views_all_commands.js
+
+  # Fails with implicit sessions because it will use multiple sessions on the same Mongo connection.
+  - jstests/core/dropdb.js
+
+  # Committing a transaction when the server is fsync locked fails.
+  - jstests/core/fsync.js
+
+  # Expects legacy errors ($err).
+  - jstests/core/constructors.js
+
+  # txn interrupted by command outside of txn before getMore runs.
+  - jstests/core/commands_namespace_parsing.js
+  - jstests/core/drop3.js
+  - jstests/core/ensure_sorted.js
+  - jstests/core/geo_s2cursorlimitskip.js
+  - jstests/core/getmore_invalidated_cursors.js
+  - jstests/core/getmore_invalidated_documents.js
+  - jstests/core/kill_cursors.js
+  - jstests/core/list_collections1.js
+  - jstests/core/list_indexes.js
+  - jstests/core/oro.js
+
+  # Expects certain number of operations in the system.profile collection.
+  - jstests/core/profile*.js
+
+  # Parallel Shell - we do not signal the override to end a txn when a parallel shell closes.
+  - jstests/core/awaitdata_getmore_cmd.js
+  - jstests/core/compact_keeps_indexes.js
+  - jstests/core/count10.js
+  - jstests/core/count_plan_summary.js
+  - jstests/core/coveredIndex3.js
+  - jstests/core/currentop.js
+  - jstests/core/distinct3.js
+  - jstests/core/evald.js
+  - jstests/core/find_and_modify_concurrent_update.js
+  - jstests/core/fsync.js
+  - jstests/core/geo_update_btree.js
+  - jstests/core/killop_drop_collection.js
+  - jstests/core/loadserverscripts.js
+  - jstests/core/mr_killop.js
+  - jstests/core/remove9.js
+  - jstests/core/removeb.js
+  - jstests/core/removec.js
+  - jstests/core/shellstartparallel.js
+  - jstests/core/updatef.js
+
+  # Command expects to see result from parallel operation.
+  # E.g. Suppose the following sequence of events: op1, join() op2 in parallel shell, op3.
+  # op3 will still be using the snapshot from op1, and not see op2 at all.
+  - jstests/core/cursora.js
+  - jstests/core/bench_test1.js
+
+  # It is illegal to open a tailable cursor in a transaction
+  - jstests/core/geo_near_tailable.js
+
+  exclude_with_any_tags:
+  # "Cowardly refusing to override read concern of command: ..."
+  - assumes_read_concern_unchanged
+  # "writeConcern is not allowed within a multi-statement transaction"
+  - assumes_write_concern_unchanged
+
+executor:
+  config:
+    shell_options:
+      # Ignore assertion failures from the shell in this test suite.
+      eval: var testingReplication = true; doassert = Function.prototype;
+      global_vars:
+        TestData:
+          sessionOptions:
+            causalConsistency: false
+      readMode: commands
+  hooks:
+  # The CheckReplDBHash hook waits until all operations have replicated to and have been applied
+  # on the secondaries, so we run the ValidateCollections hook after it to ensure we're
+  # validating the entire contents of the collection.
+  - class: CheckReplOplogs
+  - class: CheckReplDBHash
+  - class: ValidateCollections
+  - class: CleanEveryN
+    n: 20
+  fixture:
+    class: ReplicaSetFixture
+    mongod_options:
+      set_parameters:
+        enableTestCommands: 1
+        numInitialSyncAttempts: 1
+        failpoint.setTransactionLifetimeToRandomMillis:
+          mode: alwaysOn
+        failpoint.increaseFrequencyOfPeriodicThreadToExpireTransactions:
+          mode: alwaysOn
+    num_nodes: 2
diff --git a/buildscripts/resmokelib/selector.py b/buildscripts/resmokelib/selector.py
index 0ed27403cbc..cbd1eb86bb3 100644
--- a/buildscripts/resmokelib/selector.py
+++ b/buildscripts/resmokelib/selector.py
@@ -679,11 +679,13 @@ _SELECTOR_REGISTRY = {
     "db_test": (_DbTestSelectorConfig, _DbTestSelector),
     "fsm_workload_test": (_JSTestSelectorConfig, _JSTestSelector),
     "parallel_fsm_workload_test": (_MultiJSTestSelectorConfig, _MultiJSTestSelector),
+    "abort_txns_fsm_workload_test": (_JSTestSelectorConfig, _JSTestSelector),
     "json_schema_test": (_JsonTestSelectorConfig, _Selector),
     "js_test": (_JSTestSelectorConfig, _JSTestSelector),
     "multi_stmt_txn_passthrough": (_JSTestSelectorConfig, _JSTestSelector),
     "py_test": (_PyTestCaseSelectorConfig, _Selector),
     "sleep_test": (_SleepTestCaseSelectorConfig, _SleepTestCaseSelector),
+    "abort_multi_stmt_txn_passthrough": (_JSTestSelectorConfig, _JSTestSelector),
 }
 
 
diff --git a/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py b/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
index 6cbda33abe9..f2cf17dd9c3 100644
--- a/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
+++ b/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
@@ -99,3 +99,26 @@ class ParallelFSMWorkloadTestCase(FSMWorkloadTestCase):
         for workload_name in sorted(selected_tests):
             uid.update(workload_name)
         return uid.hexdigest()
+
+
+class AbortTxnsFSMWorkloadTestCase(FSMWorkloadTestCase):
+    """An FSM workload intended to be used by test suites that test transaction expiration logic."""
+
+    REGISTERED_NAME = "abort_txns_fsm_workload_test"
+
+    def _execute(self, process):
+        """Run the specified process, logging instead of raising on a non-zero return code."""
+        self.logger.info("Starting %s...\n%s", self.short_description(), process.as_command())
+
+        process.start()
+        self.logger.info("%s started with pid %s.", self.short_description(), process.pid)
+
+        self.return_code = process.wait()
+        # This test case is intended to randomly abort transactions in the FSM workload. We only
+        # expect to return a failure when the system crashes. This is different from the base
+        # implementation where we will raise on a non-zero return code.
+        if self.return_code != 0:
+            self.logger.info("Returning quietly instead of throwing failure: %s",
+                             self.short_description())
+
+        self.logger.info("%s finished.", self.short_description())
diff --git a/buildscripts/resmokelib/testing/testcases/multi_stmt_txn_test.py b/buildscripts/resmokelib/testing/testcases/multi_stmt_txn_test.py
index 1e790612153..83c658557a6 100644
--- a/buildscripts/resmokelib/testing/testcases/multi_stmt_txn_test.py
+++ b/buildscripts/resmokelib/testing/testcases/multi_stmt_txn_test.py
@@ -27,3 +27,34 @@ class MultiStmtTxnTestCase(jsrunnerfile.JSRunnerFileTestCase):
 
     def _populate_test_data(self, test_data):
         test_data["multiStmtTxnTestFile"] = self.multi_stmt_txn_test_file
+
+
+class AbortMultiStmtTxnTestCase(MultiStmtTxnTestCase):
+    """Test case for aborting multi statement transactions."""
+
+    REGISTERED_NAME = "abort_multi_stmt_txn_passthrough"
+
+    def __init__(self, logger, multi_stmt_txn_test_file, shell_executable=None, shell_options=None):
+        """Initialize AbortMultiStmtTxnTestCase to be used to test transaction expiration logic."""
+        # pylint: disable=non-parent-init-called,super-init-not-called
+        jsrunnerfile.JSRunnerFileTestCase.__init__(
+            self, logger, "Abort Multi-statement Transaction Passthrough", multi_stmt_txn_test_file,
+            test_runner_file="jstests/libs/txns/txn_passthrough_runner.js",
+            shell_executable=shell_executable, shell_options=shell_options)
+
+    def _execute(self, process):
+        """Run the specified process, logging instead of raising on a non-zero return code."""
+        self.logger.info("Starting %s...\n%s", self.short_description(), process.as_command())
+
+        process.start()
+        self.logger.info("%s started with pid %s.", self.short_description(), process.pid)
+
+        self.return_code = process.wait()
+        # This test case is intended to randomly abort transactions in the core passthrough. We only
+        # expect to return a failure when the system crashes. This is different from the base
+        # implementation where we will raise on a non-zero return code.
+        if self.return_code != 0:
+            self.logger.info("Returning quietly instead of throwing failure: %s",
+                             self.short_description())
+
+        self.logger.info("%s finished.", self.short_description())
author	Jason Chan <jason.chan@10gen.com>	2020-11-10 12:07:35 -0500
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2020-11-12 07:02:09 +0000
commit	e399bf8689f592129c9655933bdb6a0e551a47b8 (patch)
tree	0609ae3f7617076eb13c6d7f38453f44a74cfdac /buildscripts
parent	0ed9f1bf7d27e43174ce806291a597947b1f98ae (diff)
download	mongo-e399bf8689f592129c9655933bdb6a0e551a47b8.tar.gz