summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--etc/backports_required_for_multiversion_tests.yml4
-rw-r--r--jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js135
-rw-r--r--src/mongo/base/error_codes.yml2
3 files changed, 140 insertions, 1 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index 27e86c55998..9cd646cb39d 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -114,6 +114,8 @@ last-continuous:
test_file: jstests/replsets/replSetGetStatus_member_wall_times.js
- ticket: SERVER-60682
test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+ - ticket: SERVER-60685
+ test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
@@ -388,6 +390,8 @@ last-lts:
test_file: jstests/replsets/replSetGetStatus_member_wall_times.js
- ticket: SERVER-60682
test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+ - ticket: SERVER-60685
+ test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
diff --git a/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
new file mode 100644
index 00000000000..a905759b2a8
--- /dev/null
+++ b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
@@ -0,0 +1,135 @@
+/**
+ * Checks that the TransactionCoordinator does not crash if a transaction is aborted while an
+ * attempt to commit that transaction is in progress.
+ *
+ * Step 1. Run and commit a transaction in order to initialize TransactionCoordinator.
+ *
+ * Step 2. Run `kNumWriteTickets` remove operations in parallel so that they take up all of the
+ * WiredTiger write tickets.
+ *
+ * Step 3. Run a transaction in parallel, but do not attempt to commit it until all of the remove
+ * operations have taken WiredTiger write tickets.
+ * Step 4. Wait for the transaction to get past waiting for votes (e.g. `deletingCoordinatorDoc`).
+ *
+ * Step 5. Turn off the `hangWithLockDuringBatchRemove` failpoint and join the parallel remove
+ * operations and the transaction thread.
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/libs/parallelTester.js');
+load("jstests/sharding/libs/create_sharded_collection_util.js");
+
+const kNumWriteTickets = 10;
+const st = new ShardingTest({
+ mongos: 1,
+ config: 1,
+ shards: 2,
+ rs: {nodes: 1},
+ rsOptions: {
+ setParameter: {
+ wiredTigerConcurrentWriteTransactions: kNumWriteTickets,
+ // Lower transactionLifetimeLimitSeconds to cause TransactionCoordinators which haven't
+ // yet made their commit or abort decision to time out and abort the transaction.
+ transactionLifetimeLimitSeconds: 20,
+ }
+ }
+});
+
+const dbName = "test";
+const collName = "mycoll";
+const failpointName = 'hangWithLockDuringBatchRemove';
+const sourceCollection = st.s.getDB(dbName).getCollection(collName);
+const txnCoordinator = st.rs1.getPrimary();
+const insertLatch = new CountDownLatch(1);
+
+CreateShardedCollectionUtil.shardCollectionWithChunks(sourceCollection, {key: 1}, [
+ {min: {key: MinKey}, max: {key: 0}, shard: st.shard0.shardName},
+ {min: {key: 0}, max: {key: MaxKey}, shard: st.shard1.shardName},
+]);
+
+const removeOperationThreads = Array.from({length: kNumWriteTickets}).map(() => {
+ return new Thread(function removeOperation(host, dbName, collName, insertLatch) {
+ const conn = new Mongo(host);
+ const testDB = conn.getDB(dbName);
+ const coll = testDB.getCollection(collName);
+ insertLatch.await();
+ assert.commandWorked(coll.remove({key: 200}, {justOne: true}));
+ }, st.s.host, dbName, collName, insertLatch);
+});
+
+const session = st.s.startSession({causalConsistency: false});
+const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+
+// A two-phase commit transaction is first run to ensure the TransactionCoordinator has recovered
+// and persisted a topology time. The transactionThread will run a second two-phase commit
+// transaction using the same shard for coordinating the transaction. This ensures the
+// transactionThread won't need to persist a topology time. The scenario reported in SERVER-60685
+// depended on the TransactionCoordinator being interrupted while persisting the participant list
+// which happens after waiting for the topology time to become durable.
+session.startTransaction();
+assert.commandWorked(sessionCollection.insert({key: 400}));
+assert.commandWorked(sessionCollection.insert({key: -400}));
+assert.commandWorked(session.commitTransaction_forTesting());
+
+const hangWithLockDuringBatchRemoveFp = configureFailPoint(txnCoordinator, failpointName);
+
+const transactionThread = new Thread(
+ function runTwoPhaseCommitTxnAndTimeOutBeforeWritingParticipantList(
+ host, dbName, collName, failpointName, totalWriteTickets, insertLatch) {
+ const conn = new Mongo(host);
+ const session = conn.startSession({causalConsistency: false});
+ const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+ session.startTransaction();
+ assert.commandWorked(sessionCollection.insert({key: 400}));
+ assert.commandWorked(sessionCollection.insert({key: -400}));
+ insertLatch.countDown();
+
+ const currentOp = (pipeline = []) =>
+ conn.getDB("admin").aggregate([{$currentOp: {}}, ...pipeline]).toArray();
+ assert.soon(() => {
+ const removeOperations = currentOp([
+ {$match: {op: "remove", failpointMsg: failpointName, ns: `${dbName}.${collName}`}}
+ ]);
+ return removeOperations.length === totalWriteTickets;
+ }, () => `Timed out waiting for the remove operations: ${tojson(currentOp())}`);
+
+ // After here all of the WiredTiger write tickets should be taken.
+ assert.commandFailedWithCode(session.commitTransaction_forTesting(),
+ ErrorCodes.NoSuchTransaction);
+ },
+ st.s.host,
+ dbName,
+ collName,
+ failpointName,
+ kNumWriteTickets,
+ insertLatch);
+
+transactionThread.start();
+
+removeOperationThreads.forEach(thread => thread.start());
+
+let twoPhaseCommitCoordinatorServerStatus;
+assert.soon(
+ () => {
+ twoPhaseCommitCoordinatorServerStatus =
+ txnCoordinator.getDB(dbName).serverStatus().twoPhaseCommitCoordinator;
+ const {deletingCoordinatorDoc, waitingForDecisionAcks, writingDecision} =
+ twoPhaseCommitCoordinatorServerStatus.currentInSteps;
+ return deletingCoordinatorDoc.toNumber() === 1 || waitingForDecisionAcks.toNumber() === 1 ||
+ writingDecision.toNumber() === 1;
+ },
+ () => `Failed to find 1 total transactions in a state past kWaitingForVotes: ${
+ tojson(twoPhaseCommitCoordinatorServerStatus)}`);
+
+hangWithLockDuringBatchRemoveFp.off();
+
+transactionThread.join();
+removeOperationThreads.forEach((thread) => {
+ thread.join();
+});
+
+st.stop();
+})();
diff --git a/src/mongo/base/error_codes.yml b/src/mongo/base/error_codes.yml
index 420b78dc8ff..8a2805d2d4b 100644
--- a/src/mongo/base/error_codes.yml
+++ b/src/mongo/base/error_codes.yml
@@ -327,7 +327,7 @@ error_codes:
# TransactionCoordinatorSteppingDown gets converted to InterruptedDueToReplStateChange
- {code: 281,name: TransactionCoordinatorSteppingDown,categories: [Interruption,InternalOnly]}
- - {code: 282,name: TransactionCoordinatorReachedAbortDecision,categories: [InternalOnly]}
+ - {code: 282,name: TransactionCoordinatorReachedAbortDecision,categories: [Interruption,InternalOnly]}
- {code: 283,name: WouldChangeOwningShard,extra: WouldChangeOwningShardInfo}
- {code: 284,name: ForTestingErrorExtraInfoWithExtraInfoInNamespace,