diff options
author | Luis Osta <luis.osta@mongodb.com> | 2022-01-10 21:56:35 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-10 23:08:49 +0000 |
commit | f2fc34bdec288cf3b90bf926d6a6c77631f4fa10 (patch) | |
tree | de8cce6e19d5d879b0b6982ad3d8c6d6bdd7038b | |
parent | ebed720bc2c1037b658bbbe027fbb38965babc2f (diff) | |
download | mongo-f2fc34bdec288cf3b90bf926d6a6c77631f4fa10.tar.gz |
SERVER-61816 Add steps past kWaitingForVotes to assert.soon
(cherry picked from commit 6d1b572c7ddbba652ffa49dc3783fbd27cec9714)
SERVER-60685 Add Interruption category to 'TransactionCoordinatorReachedAbortDecision'
(cherry picked from commit 78ab98a46b53582a5e69424bbb92f25c483fec0a)
4 files changed, 129 insertions, 1 deletions
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
index fab6f7fda0c..29aadda4bc7 100644
--- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
+++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
@@ -45,6 +45,7 @@ selector:
   - jstests/sharding/change_stream_show_migration_events.js
   - jstests/sharding/prepare_transaction_then_migrate.js
   - jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+  - jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
   # Enable after SERVER-40258 gets backported and available in the official 4.2 binaries.
   - jstests/sharding/prepared_txn_metadata_refresh.js
   # Enable after SERVER-38691 gets backported to 4.2 and becomes the last stable.
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index e430de192fd..dd877b77bd3 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -124,6 +124,8 @@ all:
     test_file: jstests/sharding/collation_shard_targeting_hashed_shard_key.js
   - ticket: SERVER-60682
     test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+  - ticket: SERVER-60685
+    test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
 
 # Tests that should only be excluded from particular suites should be listed under that suite.
 suites:
diff --git a/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
new file mode 100644
index 00000000000..a555fb0cc95
--- /dev/null
+++ b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
@@ -0,0 +1,129 @@
+/**
+ * The test checks that the TransactionCoordinator will not crash if the transaction is aborted when
+ * attempting to commit a transaction.
+ *
+ * Step 1. Run and commit a transaction in order to initialize TransactionCoordinator.
+ *
+ * Step 2. Run `kNumWriteTickets` remove operations in parallel. So that they take up all of the
+ * WiredTiger tickets.
+ *
+ * Step 3. Run a transaction in parallel, but do not attempt to commit it until
+ * all of the remove operations have taken WiredTiger tickets.
+ *
+ * Step 4. Wait for the transaction to reach the `deletingCoordinatorDoc` state.
+ *
+ * Step 5. Turn off the `hangWithLockDuringBatchRemoveFp`
+ * and join the parallel remove operations and transaction thread.
+ *
+ * @tags: [uses_multi_shard_transaction]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/libs/parallelTester.js');
+load("jstests/sharding/libs/create_sharded_collection_util.js");
+
+const kNumWriteTickets = 10;
+const st = new ShardingTest({
+    mongos: 1,
+    config: 1,
+    shards: 2,
+    rs: {nodes: 1},
+    rsOptions: {
+        setParameter: {
+            wiredTigerConcurrentWriteTransactions: kNumWriteTickets,
+            // Lower transactionLifetimeLimitSeconds to cause TransactionCoordinators which haven't
+            // yet made their commit or abort decision to time out and abort the transaction.
+            transactionLifetimeLimitSeconds: 20,
+        }
+    }
+});
+
+const dbName = "test";
+const collName = "mycoll";
+const failpointName = 'hangWithLockDuringBatchRemove';
+const sourceCollection = st.s.getDB(dbName).getCollection(collName);
+const txnCoordinator = st.rs1.getPrimary();
+// Counted down by the transactionThread once its two inserts have succeeded, so the remove
+// operations only start competing for write tickets after the transaction holds its locks.
+const insertLatch = new CountDownLatch(1);
+
+CreateShardedCollectionUtil.shardCollectionWithChunks(sourceCollection, {key: 1}, [
+    {min: {key: MinKey}, max: {key: 0}, shard: st.shard0.shardName},
+    {min: {key: 0}, max: {key: MaxKey}, shard: st.shard1.shardName},
+]);
+
+const removeOperationThreads = Array.from({length: kNumWriteTickets}).map(() => {
+    return new Thread(function removeOperation(host, dbName, collName, insertLatch) {
+        const conn = new Mongo(host);
+        const testDB = conn.getDB(dbName);
+        const coll = testDB.getCollection(collName);
+        insertLatch.await();
+        assert.commandWorked(coll.remove({key: 200}, {justOne: true}));
+    }, st.s.host, dbName, collName, insertLatch);
+});
+
+const session = st.s.startSession({causalConsistency: false});
+const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+
+// A two-phase commit transaction is first run to ensure the TransactionCoordinator has recovered
+// and persisted a topology time. The transactionThread will run a second two-phase commit
+// transaction using the same shard for coordinating the transaction. This ensures the
+// transactionThread won't need to persist a topology time. The scenario reported in SERVER-60685
+// depended on the TransactionCoordinator being interrupted while persisting the participant list
+// which happens after waiting for the topology time to become durable.
+session.startTransaction();
+assert.commandWorked(sessionCollection.insert({key: 400}));
+assert.commandWorked(sessionCollection.insert({key: -400}));
+assert.commandWorked(session.commitTransaction_forTesting());
+
+const hangWithLockDuringBatchRemoveFp = configureFailPoint(txnCoordinator, failpointName);
+
+const transactionThread = new Thread(
+    function runTwoPhaseCommitTxnAndTimeOutBeforeWritingParticipantList(
+        host, dbName, collName, failpointName, totalWriteTickets, insertLatch) {
+        const conn = new Mongo(host);
+        const session = conn.startSession({causalConsistency: false});
+        const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+        session.startTransaction();
+        assert.commandWorked(sessionCollection.insert({key: 400}));
+        assert.commandWorked(sessionCollection.insert({key: -400}));
+        insertLatch.countDown();
+
+        const currentOp = (pipeline = []) =>
+            conn.getDB("admin").aggregate([{$currentOp: {}}, ...pipeline]).toArray();
+        assert.soon(() => {
+            const removeOperations = currentOp(
+                [{$match: {op: "remove", msg: failpointName, ns: `${dbName}.${collName}`}}]);
+            return removeOperations.length === totalWriteTickets;
+        }, () => `Timed out waiting for the remove operations: ${tojson(currentOp())}`);
+
+        // After here all of the WiredTiger write tickets should be taken.
+        assert.commandFailedWithCode(
+            session.commitTransaction_forTesting(),
+            [ErrorCodes.TransactionCoordinatorReachedAbortDecision, ErrorCodes.NoSuchTransaction]);
+    },
+    st.s.host,
+    dbName,
+    collName,
+    failpointName,
+    kNumWriteTickets,
+    insertLatch);
+
+transactionThread.start();
+
+removeOperationThreads.forEach(thread => thread.start());
+
+// Ordering is deliberate: the transaction thread is joined (i.e. its commit must already have
+// failed) before the failpoint is released, while the removes still hold every write ticket.
+transactionThread.join();
+hangWithLockDuringBatchRemoveFp.off();
+
+removeOperationThreads.forEach((thread) => {
+    thread.join();
+});
+
+st.stop();
+})();
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 5ca821047f3..9a671903839 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -332,7 +332,8 @@ error_class("Interruption",
              ["Interrupted",
               "CursorKilled",
               "LockTimeout",
               "ClientDisconnect",
-              "ClientMarkedKilled"])
+              "ClientMarkedKilled",
+              "TransactionCoordinatorReachedAbortDecision"])
 # isNotPrimaryError() includes all codes that indicate that the node that received the request was
 # not primary at some point during command processing, regardless of whether some write may have