summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luis Osta <luis.osta@mongodb.com> 2022-01-10 21:56:35 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-01-10 23:08:49 +0000
commit: f2fc34bdec288cf3b90bf926d6a6c77631f4fa10 (patch)
tree: de8cce6e19d5d879b0b6982ad3d8c6d6bdd7038b
parent: ebed720bc2c1037b658bbbe027fbb38965babc2f (diff)
download: mongo-f2fc34bdec288cf3b90bf926d6a6c77631f4fa10.tar.gz
SERVER-61816 Add steps past kWaitingForVotes to assert.soon
(cherry picked from commit 6d1b572c7ddbba652ffa49dc3783fbd27cec9714)
SERVER-60685 Add Interruption category to 'TransactionCoordinatorReachedAbortDecision'
(cherry picked from commit 78ab98a46b53582a5e69424bbb92f25c483fec0a)
-rw-r--r-- buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml | 1
-rw-r--r-- etc/backports_required_for_multiversion_tests.yml | 2
-rw-r--r-- jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js | 124
-rw-r--r-- src/mongo/base/error_codes.err | 3
4 files changed, 129 insertions, 1 deletions
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
index fab6f7fda0c..29aadda4bc7 100644
--- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
+++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
@@ -45,6 +45,7 @@ selector:
- jstests/sharding/change_stream_show_migration_events.js
- jstests/sharding/prepare_transaction_then_migrate.js
- jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+ - jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
# Enable after SERVER-40258 gets backported and available in the official 4.2 binaries.
- jstests/sharding/prepared_txn_metadata_refresh.js
# Enable after SERVER-38691 gets backported to 4.2 and becomes the last stable.
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index e430de192fd..dd877b77bd3 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -124,6 +124,8 @@ all:
test_file: jstests/sharding/collation_shard_targeting_hashed_shard_key.js
- ticket: SERVER-60682
test_file: jstests/sharding/coordinate_txn_commit_with_tickets_exhausted.js
+ - ticket: SERVER-60685
+ test_file: jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
diff --git a/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
new file mode 100644
index 00000000000..a555fb0cc95
--- /dev/null
+++ b/jstests/sharding/cancel_coordinate_txn_commit_with_tickets_exhausted.js
@@ -0,0 +1,124 @@
+/**
+ * The test checks that the TransactionCoordinator will not crash if the transaction is aborted when
+ * attempting to commit a transaction.
+ *
+ * Step 1. Run and commit a transaction in order to initialize TransactionCoordinator.
+ *
+ * Step 2. Run `kNumWriteTickets` remove operations in parallel so that they take up all of the
+ * WiredTiger write tickets.
+ *
+ * Step 3. Run a transaction in parallel, but do not attempt to commit it until all of the remove
+ * operations have taken WiredTiger tickets.
+ *
+ * Step 4. Attempt the commit and expect it to fail with an abort decision or NoSuchTransaction.
+ *
+ * Step 5. Turn off `hangWithLockDuringBatchRemoveFp` and join the remove and transaction threads.
+ *
+ * @tags: [uses_multi_shard_transaction]
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+load('jstests/libs/parallelTester.js');
+load("jstests/sharding/libs/create_sharded_collection_util.js");
+
+const kNumWriteTickets = 10;
+const st = new ShardingTest({
+ mongos: 1,
+ config: 1,
+ shards: 2,
+ rs: {nodes: 1},
+ rsOptions: {
+ setParameter: {
+ wiredTigerConcurrentWriteTransactions: kNumWriteTickets,
+ // Lower transactionLifetimeLimitSeconds to cause TransactionCoordinators which haven't
+ // yet made their commit or abort decision to time out and abort the transaction.
+ transactionLifetimeLimitSeconds: 20,
+ }
+ }
+});
+
+const dbName = "test";
+const collName = "mycoll";
+const failpointName = 'hangWithLockDuringBatchRemove';
+const sourceCollection = st.s.getDB(dbName).getCollection(collName);
+const txnCoordinator = st.rs1.getPrimary();
+const insertLatch = new CountDownLatch(1);
+
+CreateShardedCollectionUtil.shardCollectionWithChunks(sourceCollection, {key: 1}, [
+ {min: {key: MinKey}, max: {key: 0}, shard: st.shard0.shardName},
+ {min: {key: 0}, max: {key: MaxKey}, shard: st.shard1.shardName},
+]);
+
+const removeOperationThreads = Array.from({length: kNumWriteTickets}).map(() => {
+ return new Thread(function removeOperation(host, dbName, collName, insertLatch) {
+ const conn = new Mongo(host);
+ const testDB = conn.getDB(dbName);
+ const coll = testDB.getCollection(collName);
+ insertLatch.await();
+ assert.commandWorked(coll.remove({key: 200}, {justOne: true}));
+ }, st.s.host, dbName, collName, insertLatch);
+});
+
+const session = st.s.startSession({causalConsistency: false});
+const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+
+// A two-phase commit transaction is first run to ensure the TransactionCoordinator has recovered
+// and persisted a topology time. The transactionThread will run a second two-phase commit
+// transaction using the same shard for coordinating the transaction. This ensures the
+// transactionThread won't need to persist a topology time. The scenario reported in SERVER-60685
+// depended on the TransactionCoordinator being interrupted while persisting the participant list
+// which happens after waiting for the topology time to become durable.
+session.startTransaction();
+assert.commandWorked(sessionCollection.insert({key: 400}));
+assert.commandWorked(sessionCollection.insert({key: -400}));
+assert.commandWorked(session.commitTransaction_forTesting());
+
+const hangWithLockDuringBatchRemoveFp = configureFailPoint(txnCoordinator, failpointName);
+
+const transactionThread = new Thread(
+ function runTwoPhaseCommitTxnAndTimeOutBeforeWritingParticipantList(
+ host, dbName, collName, failpointName, totalWriteTickets, insertLatch) {
+ const conn = new Mongo(host);
+ const session = conn.startSession({causalConsistency: false});
+ const sessionCollection = session.getDatabase(dbName).getCollection(collName);
+ session.startTransaction();
+ assert.commandWorked(sessionCollection.insert({key: 400}));
+ assert.commandWorked(sessionCollection.insert({key: -400}));
+ insertLatch.countDown();
+
+ const currentOp = (pipeline = []) =>
+ conn.getDB("admin").aggregate([{$currentOp: {}}, ...pipeline]).toArray();
+ assert.soon(() => {
+ const removeOperations = currentOp(
+ [{$match: {op: "remove", msg: failpointName, ns: `${dbName}.${collName}`}}]);
+ return removeOperations.length === totalWriteTickets;
+ }, () => `Timed out waiting for the remove operations: ${tojson(currentOp())}`);
+
+ // After here all of the WiredTiger write tickets should be taken.
+ assert.commandFailedWithCode(
+ session.commitTransaction_forTesting(),
+ [ErrorCodes.TransactionCoordinatorReachedAbortDecision, ErrorCodes.NoSuchTransaction]);
+ },
+ st.s.host,
+ dbName,
+ collName,
+ failpointName,
+ kNumWriteTickets,
+ insertLatch);
+
+transactionThread.start();
+
+removeOperationThreads.forEach(thread => thread.start());
+
+transactionThread.join();
+hangWithLockDuringBatchRemoveFp.off();
+
+removeOperationThreads.forEach((thread) => {
+ thread.join();
+});
+
+st.stop();
+})();
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 5ca821047f3..9a671903839 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -332,7 +332,8 @@ error_class("Interruption", ["Interrupted",
"CursorKilled",
"LockTimeout",
"ClientDisconnect",
- "ClientMarkedKilled"])
+ "ClientMarkedKilled",
+ "TransactionCoordinatorReachedAbortDecision"])
# isNotPrimaryError() includes all codes that indicate that the node that received the request was
# not primary at some point during command processing, regardless of whether some write may have