SERVER-39316 Test two-phase abort path without 'config server is coordinator' override in txn_failover_two_phase_commit.js

author: Esha Maharishi <esha.maharishi@mongodb.com> 2019-01-31 15:04:42 -0500
committer: Esha Maharishi <esha.maharishi@mongodb.com> 2019-01-31 19:01:42 -0500
commit: 9606de0f0f3166b9c8fcff033f2476af2937f685 (patch)
tree: 187e4e382deaab3626fa5abb0324cf763fb75471
parent: f4656acfee11569a796e06d14e4825ab54d39ecc (diff)
download: mongo-9606de0f0f3166b9c8fcff033f2476af2937f685.tar.gz
1 files changed, 87 insertions, 67 deletions
diff --git a/jstests/sharding/txn_failover_two_phase_commit.js b/jstests/sharding/txn_failover_two_phase_commit.js
index cac9e0f1224..0fc9f1b7a2c 100644
--- a/jstests/sharding/txn_failover_two_phase_commit.js
+++ b/jstests/sharding/txn_failover_two_phase_commit.js
@@ -6,6 +6,10 @@
  * @tags: [uses_transactions, uses_multi_shard_transaction]
  */
 
+// The UUID consistency check uses connections to shards cached on the ShardingTest object, but this
+// test causes failovers on a shard, so the cached connection is not usable.
+TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
+
 (function() {
     'use strict';
 
@@ -23,11 +27,10 @@
 
     let failpointCounter = 0;
 
-    const runTest = function(sameNodeStepsUpAfterFailover) {
-
-        jsTest.log("Testing all scenarios with sameNodeStepsUpAfterFailover: " +
-                   sameNodeStepsUpAfterFailover);
+    let lsid = {id: UUID()};
+    let txnNumber = 0;
 
+    const runTest = function(sameNodeStepsUpAfterFailover, overrideCoordinatorToBeConfigServer) {
         let stepDownSecs;  // The amount of time the node has to wait before becoming primary again.
         let numCoordinatorNodes;
         if (sameNodeStepsUpAfterFailover) {
@@ -38,28 +41,42 @@
             stepDownSecs = 3;
         }
 
-        let st = new ShardingTest({
-            shards: 3,                    // number of *regular shards*
-            config: numCoordinatorNodes,  // number of replica set *nodes* in *config shard*
-            causallyConsistent: true,
-            other: {
-                mongosOptions: {
-                    // This failpoint is needed because it is not yet possible to step down a node
-                    // with a prepared transaction.
-                    setParameter:
-                        {"failpoint.sendCoordinateCommitToConfigServer": "{'mode': 'alwaysOn'}"},
-                    verbose: 3
-                },
-                configOptions: {
-                    // This failpoint is needed because of the other failpoint: the config server
-                    // will not have a local participant, so coordinateCommitTransaction cannot fall
-                    // back to recovering the decision from the local participant.
-                    setParameter: {"failpoint.doNotForgetCoordinator": "{'mode': 'alwaysOn'}"},
+        let st, coordinatorReplSetTest;
+        if (overrideCoordinatorToBeConfigServer) {
+            st = new ShardingTest({
+                shards: 3,                    // number of *regular shards*
+                config: numCoordinatorNodes,  // number of replica set *nodes* in *config shard*
+                causallyConsistent: true,
+                other: {
+                    mongosOptions: {
+                        // This failpoint is needed because it is not yet possible to step down a
+                        // node with a prepared transaction.
+                        setParameter: {
+                            "failpoint.sendCoordinateCommitToConfigServer": "{'mode': 'alwaysOn'}"
+                        },
+                        verbose: 3
+                    },
+                    configOptions: {
+                        // This failpoint is needed because of the other failpoint: the config
+                        // server will not have a local participant, so coordinateCommitTransaction
+                        // cannot fall back to recovering the decision from the local participant.
+                        setParameter: {"failpoint.doNotForgetCoordinator": "{'mode': 'alwaysOn'}"},
+                    }
                 }
-            }
-        });
+            });
+
+            coordinatorReplSetTest = st.configRS;
+        } else {
+            st = new ShardingTest({
+                shards: 3,
+                rs0: {nodes: numCoordinatorNodes},
+                causallyConsistent: true,
+                other: {mongosOptions: {verbose: 3}}
+            });
+
+            coordinatorReplSetTest = st.rs0;
+        }
 
-        let coordinatorReplSetTest = st.configRS;
         let participant0 = st.shard0;
         let participant1 = st.shard1;
         let participant2 = st.shard2;
@@ -67,9 +84,6 @@
         let expectedParticipantList =
             [participant0.shardName, participant1.shardName, participant2.shardName];
 
-        let lsid = {id: UUID()};
-        let txnNumber = 0;
-
         const runCommitThroughMongosInParallelShellExpectSuccess = function() {
             const runCommitExpectSuccessCode = "assert.commandWorked(db.adminCommand({" +
                 "commitTransaction: 1," + "lsid: " + tojson(lsid) + "," + "txnNumber: NumberLong(" +
@@ -101,13 +115,6 @@
             assert.commandWorked(
                 st.s.adminCommand({moveChunk: ns, find: {_id: 10}, to: participant2.shardName}));
 
-            // These forced refreshes are not strictly necessary; they just prevent extra TXN log
-            // lines from the shards starting, aborting, and restarting the transaction due to
-            // needing to refresh after the transaction has started.
-            assert.commandWorked(participant0.adminCommand({_flushRoutingTableCacheUpdates: ns}));
-            assert.commandWorked(participant1.adminCommand({_flushRoutingTableCacheUpdates: ns}));
-            assert.commandWorked(participant2.adminCommand({_flushRoutingTableCacheUpdates: ns}));
-
             // Start a new transaction by inserting a document onto each shard.
             assert.commandWorked(st.s.getDB(dbName).runCommand({
                 insert: collName,
@@ -121,17 +128,27 @@
         };
 
         const testCommitProtocol = function(makeAParticipantAbort, failpoint, expectAbortResponse) {
-            jsTest.log("Testing commit protocol with makeAParticipantAbort: " +
-                       makeAParticipantAbort + ", failpoint: " + failpoint +
-                       ", and expectAbortResponse: " + expectAbortResponse);
+            jsTest.log("Testing commit protocol with sameNodeStepsUpAfterFailover: " +
+                       sameNodeStepsUpAfterFailover + ", overrideCoordinatorToBeConfigServer: " +
+                       overrideCoordinatorToBeConfigServer + ", makeAParticipantAbort: " +
+                       makeAParticipantAbort + ", expectAbortResponse: " + expectAbortResponse +
+                       ", and failpoint: " + failpoint);
 
             txnNumber++;
             setUp();
 
+            coordinatorReplSetTest.awaitNodesAgreeOnPrimary();
+            let coordPrimary = coordinatorReplSetTest.getPrimary();
+
             if (makeAParticipantAbort) {
+                // In order to test coordinator failover for a coordinator colocated with a
+                // participant, the participant colocated with the coordinator must fail to prepare,
+                // because prepare does not yet support failover.
+                let nodeToAbort = overrideCoordinatorToBeConfigServer ? participant2 : coordPrimary;
+
                 // Manually abort the transaction on one of the participants, so that the
                 // participant fails to prepare.
-                assert.commandWorked(participant2.adminCommand({
+                assert.commandWorked(nodeToAbort.adminCommand({
                     abortTransaction: 1,
                     lsid: lsid,
                     txnNumber: NumberLong(txnNumber),
@@ -140,9 +157,6 @@
                 }));
             }
 
-            coordinatorReplSetTest.awaitNodesAgreeOnPrimary();
-            let coordPrimary = coordinatorReplSetTest.getPrimary();
-
             assert.commandWorked(coordPrimary.adminCommand({
                 configureFailPoint: failpoint,
                 mode: "alwaysOn",
@@ -213,35 +227,41 @@
         // Run through all the failpoints when all participants respond to prepare with vote commit.
         //
 
-        ++failpointCounter;
-
-        // Note: If the coordinator fails over before making the participant list durable, the
-        // transaction will abort even if all participants could have committed. Further note that
-        // this is a property of the coordinator only - in general, the coordinator is co-located
-        // with a participant and in 4.2, participants abort if they fail over before prepare. This
-        // is really testing that even if the participant's unprepared transaction was able to
-        // survive failover at some future time (for example, in the near future for read-only
-        // transactions, or in the far future if we add support for multi-master), then the
-        // transaction would nevertheless abort due to the design of the coordinator.
-        testCommitProtocol(false /* all participants can commit */,
-                           "hangBeforeWritingParticipantList",
-                           true /* expect abort decision */);
-
-        testCommitProtocol(false /* all participants can commit */,
-                           "hangBeforeWritingDecision",
-                           false /* expect commit decision */);
-        testCommitProtocol(
-            false /* all participants can commit */, "hangBeforeDeletingCoordinatorDoc", false
-            /* expect commit decision */);
-
+        // We only test two-phase commit (as opposed to two-phase abort) if the coordinator is
+        // overridden to be the config server, because prepare does not yet support failover.
+        if (overrideCoordinatorToBeConfigServer) {
+            ++failpointCounter;
+
+            // Note: If the coordinator fails over before making the participant list durable, the
+            // transaction will abort even if all participants could have committed. This is a
+            // property of the coordinator only, and would be true even if a participant's
+            // in-progress transaction could survive failover.
+            testCommitProtocol(false /* all participants can commit */,
+                               "hangBeforeWritingParticipantList",
+                               true /* expect abort decision */);
+
+            testCommitProtocol(false /* all participants can commit */,
+                               "hangBeforeWritingDecision",
+                               false /* expect commit decision */);
+            testCommitProtocol(
+                false /* all participants can commit */, "hangBeforeDeletingCoordinatorDoc", false
+                /* expect commit decision */);
+        }
         st.stop();
     };
 
-    // Same node *always* steps back up after stepping down.
-    runTest(true);
+    //
+    // Coordinator is co-located with a participant
+    //
+
+    runTest(true /* same node always steps up after stepping down */, false);
+    runTest(false /* same node always steps up after stepping down */, false);
+
+    //
+    // Override coordinator to be config server
+    //
 
-    // Same or different node can step back up after stepping down (but most likely a different node
-    // will).
-    runTest(false);
+    runTest(true /* same node always steps up after stepping down */, true);
+    runTest(false /* same node always steps up after stepping down */, true);
 
 })();
author	Esha Maharishi <esha.maharishi@mongodb.com>	2019-01-31 15:04:42 -0500
committer	Esha Maharishi <esha.maharishi@mongodb.com>	2019-01-31 19:01:42 -0500
commit	9606de0f0f3166b9c8fcff033f2476af2937f685 (patch)
tree	187e4e382deaab3626fa5abb0324cf763fb75471
parent	f4656acfee11569a796e06d14e4825ab54d39ecc (diff)
download	mongo-9606de0f0f3166b9c8fcff033f2476af2937f685.tar.gz