diff options
author | XueruiFa <xuerui.fa@mongodb.com> | 2021-03-15 21:29:33 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-04-08 14:46:19 +0000 |
commit | 8cc71b9b34bb6f647993163c766d387b6dc08ae0 (patch) | |
tree | 0c4acc743e20db8d91c5fe2f1c9eb9c4e3b58f30 | |
parent | b493b9b63c6847ca7482da01d39871920a08c165 (diff) | |
download | mongo-8cc71b9b34bb6f647993163c766d387b6dc08ae0.tar.gz |
SERVER-53807: Add tenant migrations passthrough coverage for transactions
(cherry picked from commit 310744312aa7554f69ea531fa478acfd991d1a5a)
4 files changed, 448 insertions, 18 deletions
diff --git a/buildscripts/resmokeconfig/suites/tenant_migration_multi_stmt_txn_jscore_passthrough.yml b/buildscripts/resmokeconfig/suites/tenant_migration_multi_stmt_txn_jscore_passthrough.yml new file mode 100644 index 00000000000..d87b8bc0695 --- /dev/null +++ b/buildscripts/resmokeconfig/suites/tenant_migration_multi_stmt_txn_jscore_passthrough.yml @@ -0,0 +1,407 @@ +test_kind: js_test +# This suite starts two 3-node replica sets and uses the ContinuousTenantMigration hook to run +# background tenant migrations. It also wraps all CRUD commands in transactions and asserts that +# the transactions are committed/aborted correctly while tenant migrations are running. + +selector: + roots: + - jstests/core/**/*.js + + exclude_files: + # + # Excluded from tenant_migration_jscore_passthrough.yml + # + + # These tests already run with transactions. + - jstests/core/txns/**/*.js + # These tests depend on hardcoded database name equality. + - jstests/core/json_schema/misc_validation.js + - jstests/core/list_databases.js + - jstests/core/profile1.js + - jstests/core/profile3.js + - jstests/core/views/views_stats.js + # These tests contain assertions on the number of executed operations and this suite retries + # operations on TenantMigrationAborted errors. + - jstests/core/find_and_modify_metrics.js + - jstests/core/update_metrics.js + - jstests/core/operation_latency_histogram.js + - jstests/core/top.js + - jstests/core/profile_sampling.js + - jstests/core/profile_hide_index.js + # The override cannot deep copy very large or small dates. + - jstests/core/index_large_and_small_dates.js + # This test expects that the connection (i.e. 'threadName') does not change throughout each test + # case. That is not always true when there is a background tenant migration. 
+ - jstests/core/failcommand_failpoint.js + # The set_param1.js test attempts to compare the response from running the {getParameter: "*"} + # command multiple times, which may observe the change to the failpoint enabled by the migration + # hook. + - jstests/core/set_param1.js + # This test does not support tojson of command objects so the override cannot deep copy the + # command objects correctly. + - jstests/core/SERVER-23626.js + # These tests write with {w: 0} which doesn't wait for the storage transaction writing the + # document and the oplog entry to commit so the TenantMigrationConflict will not be caught. + - jstests/core/batch_write_command_w0.js + - jstests/core/crud_api.js + # These tests use benchRun which does not use runCommand. + - jstests/core/bench_test1.js + - jstests/core/bench_test3.js + - jstests/core/benchrun_pipeline_updates.js + # This test uses exhaust which does not use runCommand. + - jstests/core/exhaust.js + # These tests use db._authOrThrow which does not use runCommand. + - jstests/core/auth1.js + - jstests/core/connection_status.js + - jstests/core/user_management_helpers.js + # These tests use legacy read mode which does not use runCommand. + - jstests/core/comment_field.js + - jstests/core/invalidated_legacy_cursors.js + # TODO (SERVER-52727): Synchronize cloneCollectionAsCapped with tenant migrations. + - jstests/core/capped_convertToCapped1.js + # TODO (SERVER-52866): Synchronize getLastError with tenant migrations. + - jstests/core/bulk_legacy_enforce_gle.js + # This test contains assertions for the hostname that operations run on. + - jstests/core/currentop_cursors.js + # Server parameters are stored in-memory only so are not transferred onto the recipient. These + # tests set the server parameter "notablescan" to force the node to not execute queries that + # require a collection scan and return an error. 
+ - jstests/core/notablescan.js + - jstests/core/notablescan_capped.js + # captrunc command is not blocked during tenant migration. + - jstests/core/capped6.js + # Multi-updates that conflict with tenant migration are not retried by inject_tenant_prefix.js. + - jstests/core/batch_write_collation_estsize.js + - jstests/core/bulk_api_ordered.js + - jstests/core/bulk_api_unordered.js + - jstests/core/fts_querylang.js + - jstests/core/idhack.js + - jstests/core/role_management_helpers.js + - jstests/core/roles_info.js + - jstests/core/server1470.js + - jstests/core/update_arrayFilters.js + - jstests/core/update_arraymatch2.js + - jstests/core/update_arraymatch3.js + - jstests/core/update_arraymatch5.js + - jstests/core/update_hint.js + - jstests/core/update_multi3.js + - jstests/core/update_multi4.js + - jstests/core/update_multi5.js + - jstests/core/update_pipeline_shell_helpers.js + - jstests/core/update_with_pipeline.js + - jstests/core/update7.js + - jstests/core/updatei.js + - jstests/core/updatej.js + - jstests/core/updatel.js + - jstests/core/where_system_js.js + - jstests/core/write_result.js + + # + # Excluded from replica_sets_multi_stmt_txn_jscore_passthrough.yml + # + + ## + ## Limitations with the way the runner file injects transactions. + ## + + # These tests expect some statements to error, which will cause txns to abort entirely. + - jstests/core/capped5.js + - jstests/core/commands_with_uuid.js + - jstests/core/dbcase.js + - jstests/core/dbcase2.js + - jstests/core/explain_execution_error.js + - jstests/core/expr.js + - jstests/core/find9.js + - jstests/core/find_and_modify_invalid_query_params.js + - jstests/core/find_getmore_bsonsize.js + - jstests/core/find_getmore_cmd.js + - jstests/core/geo_allowedcomparisons.js + - jstests/core/geo_big_polygon2.js + - jstests/core/geonear_key.js + - jstests/core/get_more_cmd_refuses_api_params.js + - jstests/core/in.js + - jstests/core/index8.js # No explicit check for failed command. 
+ - jstests/core/index_decimal.js + - jstests/core/index_multiple_compatibility.js + - jstests/core/index_partial_write_ops.js + - jstests/core/indexa.js # No explicit check for failed command. + - jstests/core/indexes_multiple_commands.js + - jstests/core/js2.js + - jstests/core/json_schema/json_schema.js + - jstests/core/mr_bigobject.js + - jstests/core/not2.js + - jstests/core/null_query_semantics.js + - jstests/core/or1.js + - jstests/core/or2.js + - jstests/core/or3.js + - jstests/core/ord.js + - jstests/core/orj.js + - jstests/core/projection_expr_mod.js + - jstests/core/ref.js + - jstests/core/ref4.js + - jstests/core/regex_limit.js + - jstests/core/remove_undefined.js + - jstests/core/set7.js + - jstests/core/sortb.js + - jstests/core/sortf.js + - jstests/core/sortg.js + - jstests/core/sortj.js + - jstests/core/sort_with_meta_operator.js + - jstests/core/tailable_skip_limit.js + - jstests/core/type_array.js + - jstests/core/uniqueness.js + - jstests/core/unset2.js + - jstests/core/update_addToSet.js + - jstests/core/update_array_offset_positional.js + - jstests/core/update_find_and_modify_id.js + - jstests/core/update_modifier_pop.js + - jstests/core/updateh.js + + # Reads from system.views. + - jstests/core/views/views_drop.js + + ## + ## Some aggregation stages don't support snapshot readconcern. 
+ ## + + # $explain (requires read concern local) + - jstests/core/agg_hint.js + - jstests/core/and.js + - jstests/core/collation.js + - jstests/core/distinct_multikey_dotted_path.js + - jstests/core/distinct_with_hashed_index.js + - jstests/core/explain_shell_helpers.js + - jstests/core/index_partial_read_ops.js + - jstests/core/optimized_match_explain.js + - jstests/core/sort_array.js + - jstests/core/views/views_collation.js + - jstests/core/wildcard_index_count.js + - jstests/core/explain_server_params.js + + # $listSessions + - jstests/core/list_all_local_sessions.js + - jstests/core/list_all_sessions.js + - jstests/core/list_local_sessions.js + - jstests/core/list_sessions.js + + # $indexStats + - jstests/core/index_stats.js + + # $collStats + - jstests/core/views/views_coll_stats.js + + # Errors expected to happen in tests, which can cause transactions to get aborted. + # So when the test tries to inspect the documents it can be out of sync (relative + # to test run without multi statement transactions). + - jstests/core/cappeda.js + - jstests/core/doc_validation.js + - jstests/core/doc_validation_options.js + - jstests/core/field_name_validation.js + - jstests/core/insert_illegal_doc.js + - jstests/core/positional_projection.js + - jstests/core/push_sort.js + - jstests/core/rename4.js + - jstests/core/update_dbref.js + + # Trick for bypassing mongo shell validation in the test doesn't work because txn_override + # retry logic will hit the shell validation. + - jstests/core/invalid_db_name.js + + # Multiple writes in a txn, some of which fail because the collection doesn't exist. + # We create the collection and retry the last write, but previous writes would have + # still failed. 
+ - jstests/core/dbref1.js + - jstests/core/dbref2.js + - jstests/core/ref3.js + - jstests/core/update3.js + - jstests/core/rename3.js + + ## + ## Error: Unable to acquire lock within a max lock request timeout of '0ms' milliseconds + ## + + # Collection drops done through applyOps are not converted to w:majority + - jstests/core/views/invalid_system_views.js + + ## + ## Misc. reasons. + ## + + # SERVER-34868 Cannot run a legacy query on a session. + - jstests/core/validate_cmd_ns.js + + # SERVER-34772 Tailable Cursors are not allowed with snapshot readconcern. + - jstests/core/awaitdata_getmore_cmd.js + - jstests/core/getmore_cmd_maxtimems.js + - jstests/core/tailable_cursor_invalidation.js + - jstests/core/tailable_getmore_batch_size.js + + # SERVER-34918 The "max" option of a capped collection can be exceeded until the next insert. + # The reason is that we don't update the count of a collection until a transaction commits, + # by which point it is too late to complain that "max" has been exceeded. + - jstests/core/capped_max1.js + + # The "max" option of a capped collection can be temporarily exceeded before a + # txn is committed. + - jstests/core/bulk_insert_capped.js + + # Expects collection to not have been created + - jstests/core/insert_id_undefined.js + + # Creates sessions explicitly, resulting in txns being run through different sessions + # using a single txnNumber. + - jstests/core/views/views_all_commands.js + + # Fails with implicit sessions because it will use multiple sessions on the same Mongo connection. + - jstests/core/dropdb.js + + # Committing a transaction when the server is fsync locked fails. + - jstests/core/fsync.js + + # Expects legacy errors ($err). + - jstests/core/constructors.js + + # txn interrupted by command outside of txn before getMore runs. 
+ - jstests/core/commands_namespace_parsing.js + - jstests/core/drop3.js + - jstests/core/ensure_sorted.js + - jstests/core/geo_s2cursorlimitskip.js + - jstests/core/getmore_invalidated_cursors.js + - jstests/core/getmore_invalidated_documents.js + - jstests/core/kill_cursors.js + - jstests/core/list_collections1.js + - jstests/core/list_indexes.js + - jstests/core/list_indexes_invalidation.js + - jstests/core/list_namespaces_invalidation.js + - jstests/core/oro.js + + # Parallel Shell - we do not signal the override to end a txn when a parallel shell closes. + - jstests/core/compact_keeps_indexes.js + - jstests/core/count10.js + - jstests/core/count_plan_summary.js + - jstests/core/coveredIndex3.js + - jstests/core/crud_ops_do_not_throw_locktimeout.js + - jstests/core/distinct3.js + - jstests/core/find_and_modify_concurrent_update.js + - jstests/core/geo_update_btree.js + - jstests/core/killop_drop_collection.js + - jstests/core/loadserverscripts.js + - jstests/core/mr_killop.js + - jstests/core/remove9.js + - jstests/core/removeb.js + - jstests/core/removec.js + - jstests/core/shellstartparallel.js + - jstests/core/updatef.js + + # Command expects to see result from parallel operation. + # E.g. Suppose the following sequence of events: op1, join() op2 in parallel shell, op3. + # op3 will still be using the snapshot from op1, and not see op2 at all. + - jstests/core/bench_test1.js + - jstests/core/benchrun_pipeline_updates.js + - jstests/core/cursora.js + + # Does not support tojson of command objects. + - jstests/core/SERVER-23626.js + + # TODO(SERVER-55882): Investigate why this test is failing. + - jstests/core/wildcard_index_multikey.js + + exclude_with_any_tags: + - assumes_standalone_mongod + # These tests run getMore commands which are not supported in the tenant migration passthrough. 
+ - requires_getmore + # Due to background tenant migrations, operations in the main test shell are not guaranteed to + # be causally consistent with operations in a parallel shell. The reason is that + # TenantMigrationCommitted error is only thrown when the client does a write or an atClusterTime/ + # afterClusterTime or linearizable read. Therefore, one of the shells may not be aware that the + # migration has occurred and would not forward the read/write command to the right replica set. + - uses_parallel_shell + # Profile settings are stored in-memory only so are not transferred to the recipient. + - requires_profiling + # emptycapped command is blocked during tenant migration. + - requires_emptycapped + - tenant_migration_incompatible + # "Cowardly refusing to override read concern of command: ..." + - assumes_read_concern_unchanged + # "writeConcern is not allowed within a multi-statement transaction" + - assumes_write_concern_unchanged + # Transactions are not allowed to operate on capped collections. + - requires_capped + # Retrying a query can change whether a plan cache entry is active. + - inspects_whether_plan_cache_entry_is_active + # $out is not supported in transactions + - uses_$out + # Transaction-continuing commands cannot specify API parameters, so tests that use API parameters + # cannot be run with transactions. 
+ - uses_api_parameters + - does_not_support_transactions + +executor: + archive: + hooks: + - CheckReplOplogs + - CheckReplDBHash + - ValidateCollections + config: + shell_options: + eval: >- + testingReplication = true; + load('jstests/libs/override_methods/inject_tenant_prefix.js'); + load('jstests/libs/override_methods/enable_sessions.js'); + load('jstests/libs/override_methods/txn_passthrough_cmd_massage.js'); + load('jstests/libs/override_methods/network_error_and_txn_override.js'); + global_vars: + TestData: &TestData + tenantId: "tenantMigrationTenantId" + networkErrorAndTxnOverrideConfig: + wrapCRUDinTransactions: true + sessionOptions: + # Tests in this suite only read from primaries and only one node is electable, so causal + # consistency is not required to read your own writes. + causalConsistency: false + readMode: commands + hooks: + - class: ContinuousTenantMigration + shell_options: + global_vars: + TestData: *TestData + # The CheckReplDBHash hook waits until all operations have replicated to and have been applied + # on the secondaries, so we run the ValidateCollections hook after it to ensure we're + # validating the entire contents of the collection. + - class: CheckReplOplogs + - class: CheckReplDBHash + - class: ValidateCollections + - class: CleanEveryN + n: 1 + fixture: + class: TenantMigrationFixture + common_mongod_options: + set_parameters: + enableTestCommands: 1 + failpoint.abortTenantMigrationBeforeLeavingBlockingState: + mode: + activationProbability: 0.5 + failpoint.pauseTenantMigrationBeforeLeavingBlockingState: + mode: alwaysOn + data: + blockTimeMS: 250 + # To avoid routing commands in each test incorrectly, the ContinuousTenantMigration hook + # only runs donorForgetMigration against the donor of each migration when it is safe to do + # so. Therefore, the garbage collection delay doesn't need to be large. 
+ tenantMigrationGarbageCollectionDelayMS: 1 + ttlMonitorSleepSecs: 1 + # The donor replica set may have active transactions while a migration is in progress. If + # the migration is committed, those transactions may never receive 'commitTransaction' or + # 'abortTransaction', since all writes are automatically rerouted to the recipient. We set + # a low 'transactionLifetimeLimitSeconds' value to ensure that they are cleaned up quickly. + transactionLifetimeLimitSeconds: 10 + tlsMode: allowTLS + tlsCAFile: jstests/libs/ca.pem + tlsAllowInvalidHostnames: '' + per_mongod_options: + # Each entry is for a different replica set's extra mongod options. + - tlsCertificateKeyFile: jstests/libs/rs0.pem + - tlsCertificateKeyFile: jstests/libs/rs1.pem + num_replica_sets: 2 + num_nodes_per_replica_set: 3 + use_replica_set_connection_string: true diff --git a/etc/evergreen.yml b/etc/evergreen.yml index 7c4bf03e7e4..ca87d9d847e 100644 --- a/etc/evergreen.yml +++ b/etc/evergreen.yml @@ -6305,6 +6305,16 @@ tasks: resmoke_args: --storageEngine=wiredTiger fallback_num_sub_suites: 10 +- name: tenant_migration_multi_stmt_txn_jscore_passthrough_gen + tags: ["txn"] + commands: + - func: "generate resmoke tasks" + vars: + depends_on: jsCore + use_large_distro: "true" + resmoke_args: --storageEngine=wiredTiger + fallback_num_sub_suites: 10 + - name: parallel_gen tags: ["misc_js"] commands: @@ -10336,6 +10346,7 @@ buildvariants: - name: multi_stmt_txn_jscore_passthrough_with_migration_gen - name: tenant_migration_jscore_passthrough_gen - name: tenant_migration_causally_consistent_jscore_passthrough_gen + - name: tenant_migration_multi_stmt_txn_jscore_passthrough_gen - name: multiversion_gen - name: .query_fuzzer - name: .random_multiversion_ds diff --git a/jstests/libs/override_methods/inject_tenant_prefix.js b/jstests/libs/override_methods/inject_tenant_prefix.js index f7d805b9d95..95e927162af 100644 --- a/jstests/libs/override_methods/inject_tenant_prefix.js +++ 
b/jstests/libs/override_methods/inject_tenant_prefix.js @@ -475,30 +475,41 @@ Mongo.prototype.runCommandRetryOnTenantMigrationErrors = function( } if (migrationCommittedErr || migrationAbortedErr) { - // Update the command for reroute/retry. - modifyCmdObjForRetry(cmdObjWithTenantId, resObj, true); - // It is safe to reformat this resObj since it will not be returned to the caller of - // runCommand. - reformatResObjForLogging(resObj); - - // Build a new indexMap where the keys are the index that each write that needs to be - // retried will have in the next attempt's cmdObj. - indexMap = resetIndices(indexMap); + // If the command was inside a transaction, skip modifying any objects or fields, since + // we will retry the entire transaction outside of this file. + if (!TransactionsUtil.isTransientTransactionError(resObj)) { + // Update the command for reroute/retry. + modifyCmdObjForRetry(cmdObjWithTenantId, resObj, true); + + // It is safe to reformat this resObj since it will not be returned to the caller of + // runCommand. + reformatResObjForLogging(resObj); + + // Build a new indexMap where the keys are the index that each write that needs to + // be retried will have in the next attempt's cmdObj. + indexMap = resetIndices(indexMap); + } if (migrationCommittedErr) { + jsTestLog(`Got TenantMigrationCommitted for command against database ${ + dbNameWithTenantId} after trying ${numAttempts} times: ${tojson(resObj)}`); // Store the connection to the recipient so the next commands can be rerouted. 
this.migrationStateDoc = this.getTenantMigrationStateDoc(); this.reroutingMongo = connect(this.migrationStateDoc.recipientConnectionString).getMongo(); - - jsTest.log(`Got TenantMigrationCommitted for command against database ` + - `"${dbNameWithTenantId}" after trying ${numAttempts} times, rerouting ` + - `the command: ${tojson(resObj)}`); } else if (migrationAbortedErr) { - jsTest.log( - `Got TenantMigrationAborted for command against database ` + - `"${dbNameWithTenantId}" after trying ${numAttempts} times, retrying the ` + - `command: ${tojson(resObj)}`); + jsTestLog(`Got TenantMigrationAborted for command against database ${ + dbNameWithTenantId} after trying ${numAttempts} times: ${tojson(resObj)}`); + } + + // If the result has a TransientTransactionError label, the entire transaction must be + // retried. Return immediately to let the retry be handled by + // 'network_error_and_txn_override.js'. + if (TransactionsUtil.isTransientTransactionError(resObj)) { + jsTestLog(`Got error for transaction against database ` + + `${dbNameWithTenantId} with TransientTransactionError, retrying ` + + `transaction against recipient: ${tojson(resObj)}`); + return resObj; } } else { // Modify the resObj before returning the result. diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp index 236fc2ee5f8..b56dc8bd1b5 100644 --- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp +++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp @@ -932,7 +932,8 @@ void TenantMigrationRecipientService::Instance::_processCommittedTransactionEntr "sessionId"_attr = sessionId, "txnNumber"_attr = txnNumber, "tenantId"_attr = getTenantId(), - "migrationId"_attr = getMigrationUUID()); + "migrationId"_attr = getMigrationUUID(), + "entry"_attr = entry.toString()); auto txnParticipant = TransactionParticipant::get(opCtx); uassert(5351300, |