diff options
author | Luis Osta <luis.osta@mongodb.com> | 2021-12-07 18:00:21 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-08 14:51:02 +0000 |
commit | f33c1dac76e2799d50b3453eaf14d771dc9646ab (patch) | |
tree | be81a65c6dc6f0462413134bf819cf5aac5b30c2 | |
parent | 79599d1ea413cfc331d8b48ac617dec08bdcba0f (diff) | |
download | mongo-f33c1dac76e2799d50b3453eaf14d771dc9646ab.tar.gz |
SERVER-61816 Add steps past kWaitingForVotes to assert.soon
(cherry picked from commit 6d1b572c7ddbba652ffa49dc3783fbd27cec9714)
SERVER-60685 Add Interruption category to 'TransactionCoordinatorReachedAbortDecision'
(cherry picked from commit 78ab98a46b53582a5e69424bbb92f25c483fec0a)
(cherry picked from commit 7634ffa5d056aa5efcc12079d00da898e6f258fb)
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 4 | ||||
-rw-r--r-- | jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js | 135 | ||||
-rw-r--r-- | src/mongo/base/error_codes.yml | 2 |
3 files changed, 140 insertions, 1 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index 79c033c8b69..a1f5ddacaa6 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -72,6 +72,8 @@ last-continuous: test_file: jstests/sharding/resharding_secondary_recovers_temp_ns_metadata.js - ticket: SERVER-60682 test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js + - ticket: SERVER-60685 + test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: @@ -296,6 +298,8 @@ last-lts: test_file: jstests/sharding/resharding_secondary_recovers_temp_ns_metadata.js - ticket: SERVER-60682 test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js + - ticket: SERVER-60685 + test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: diff --git a/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js new file mode 100644 index 00000000000..a905759b2a8 --- /dev/null +++ b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js @@ -0,0 +1,135 @@ +/** + * The test checks that the TransactionCoordinator will not crash if the transaction is aborted when + * attempting to commit a transaction. + * + * Step 1. Run and commit a transaction in order to initialize TransactionCoordinator. + * + * Step 2. Run `kNumWriteTickets` remove operations in parallel. So that they take up all of the + * WiredTiger tickets. + * + * Step 3. Run a transaction in parallel, but do not attempt to commit it until + * all of the remove operations have taken WiredTiger tickets. Step 4. Wait for the transaction to + * reach the `deletingCoordinatorDoc` state. + * + * Step 5. Turn off the `hangWithLockDuringBatchRemoveFp` + * and join the parallel remove operations and transaction thread. + */ + +(function() { +"use strict"; + +load("jstests/libs/fail_point_util.js"); +load('jstests/libs/parallelTester.js'); +load("jstests/sharding/libs/create_sharded_collection_util.js"); + +const kNumWriteTickets = 10; +const st = new ShardingTest({ + mongos: 1, + config: 1, + shards: 2, + rs: {nodes: 1}, + rsOptions: { + setParameter: { + wiredTigerConcurrentWriteTransactions: kNumWriteTickets, + // Lower transactionLifetimeLimitSeconds to cause TransactionCoordinators which haven't + // yet made their commit or abort decision to time out and abort the transaction. + transactionLifetimeLimitSeconds: 20, + } + } +}); + +const dbName = "test"; +const collName = "mycoll"; +const failpointName = 'hangWithLockDuringBatchRemove'; +const sourceCollection = st.s.getDB(dbName).getCollection(collName); +const txnCoordinator = st.rs1.getPrimary(); +const insertLatch = new CountDownLatch(1); + +CreateShardedCollectionUtil.shardCollectionWithChunks(sourceCollection, {key: 1}, [ + {min: {key: MinKey}, max: {key: 0}, shard: st.shard0.shardName}, + {min: {key: 0}, max: {key: MaxKey}, shard: st.shard1.shardName}, +]); + +const removeOperationThreads = Array.from({length: kNumWriteTickets}).map(() => { + return new Thread(function removeOperation(host, dbName, collName, insertLatch) { + const conn = new Mongo(host); + const testDB = conn.getDB(dbName); + const coll = testDB.getCollection(collName); + insertLatch.await(); + assert.commandWorked(coll.remove({key: 200}, {justOne: true})); + }, st.s.host, dbName, collName, insertLatch); +}); + +const session = st.s.startSession({causalConsistency: false}); +const sessionCollection = session.getDatabase(dbName).getCollection(collName); + +// A two-phase commit transaction is first run to ensure the TransactionCoordinator has recovered +// and persisted a topology time. The transactionThread will run a second two-phase commit +// transaction using the same shard for coordinating the transaction. This ensures the +// transactionThread won't need to persist a topology time. The scenario reported in SERVER-60685 +// depended on the TransactionCoordinator being interrupted while persisting the participant list +// which happens after waiting for the topology time to become durable. +session.startTransaction(); +assert.commandWorked(sessionCollection.insert({key: 400})); +assert.commandWorked(sessionCollection.insert({key: -400})); +assert.commandWorked(session.commitTransaction_forTesting()); + +const hangWithLockDuringBatchRemoveFp = configureFailPoint(txnCoordinator, failpointName); + +const transactionThread = new Thread( + function runTwoPhaseCommitTxnAndTimeOutBeforeWritingParticipantList( + host, dbName, collName, failpointName, totalWriteTickets, insertLatch) { + const conn = new Mongo(host); + const session = conn.startSession({causalConsistency: false}); + const sessionCollection = session.getDatabase(dbName).getCollection(collName); + session.startTransaction(); + assert.commandWorked(sessionCollection.insert({key: 400})); + assert.commandWorked(sessionCollection.insert({key: -400})); + insertLatch.countDown(); + + const currentOp = (pipeline = []) => + conn.getDB("admin").aggregate([{$currentOp: {}}, ...pipeline]).toArray(); + assert.soon(() => { + const removeOperations = currentOp([ + {$match: {op: "remove", failpointMsg: failpointName, ns: `${dbName}.${collName}`}} + ]); + return removeOperations.length === totalWriteTickets; + }, () => `Timed out waiting for the remove operations: ${tojson(currentOp())}`); + + // After here all of the WiredTiger write tickets should be taken. + assert.commandFailedWithCode(session.commitTransaction_forTesting(), + ErrorCodes.NoSuchTransaction); + }, + st.s.host, + dbName, + collName, + failpointName, + kNumWriteTickets, + insertLatch); + +transactionThread.start(); + +removeOperationThreads.forEach(thread => thread.start()); + +let twoPhaseCommitCoordinatorServerStatus; +assert.soon( + () => { + twoPhaseCommitCoordinatorServerStatus = + txnCoordinator.getDB(dbName).serverStatus().twoPhaseCommitCoordinator; + const {deletingCoordinatorDoc, waitingForDecisionAcks, writingDecision} = + twoPhaseCommitCoordinatorServerStatus.currentInSteps; + return deletingCoordinatorDoc.toNumber() === 1 || waitingForDecisionAcks.toNumber() === 1 || + writingDecision.toNumber() === 1; + }, + () => `Failed to find 1 total transactions in a state past kWaitingForVotes: ${ + tojson(twoPhaseCommitCoordinatorServerStatus)}`); + +hangWithLockDuringBatchRemoveFp.off(); + +transactionThread.join(); +removeOperationThreads.forEach((thread) => { + thread.join(); +}); + +st.stop(); +})(); diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml index 183891d5d5a..d36f97ca851 100644 --- a/src/mongo/base/error_codes.yml +++ b/src/mongo/base/error_codes.yml @@ -329,7 +329,7 @@ error_codes: # TransactionCoordinatorSteppingDown gets converted to InterruptedDueToReplStateChange - {code: 281,name: TransactionCoordinatorSteppingDown,categories: [Interruption,InternalOnly]} - - {code: 282,name: TransactionCoordinatorReachedAbortDecision,categories: [InternalOnly]} + - {code: 282,name: TransactionCoordinatorReachedAbortDecision,categories: [Interruption,InternalOnly]} - {code: 283,name: WouldChangeOwningShard,extra: WouldChangeOwningShardInfo} - {code: 284,name: ForTestingErrorExtraInfoWithExtraInfoInNamespace, |