diff options
author | Esha Maharishi <esha.maharishi@mongodb.com> | 2019-02-05 09:33:17 -0500 |
---|---|---|
committer | Esha Maharishi <esha.maharishi@mongodb.com> | 2019-02-07 14:53:39 -0500 |
commit | b5563fde2a1a0beac8ce58db619300adc9f3a7d9 (patch) | |
tree | 24e9b194eb1e810d86b3eeb4cade576cf61f18c5 /jstests/sharding | |
parent | 24a0276f123949081c1cd0d1c7876551b3f065a1 (diff) | |
download | mongo-b5563fde2a1a0beac8ce58db619300adc9f3a7d9.tar.gz |
SERVER-39053 Transaction coordinator should retry sending decision until definitively hearing an ack
Diffstat (limited to 'jstests/sharding')
4 files changed, 89 insertions, 52 deletions
diff --git a/jstests/sharding/libs/sharded_transactions_helpers.js b/jstests/sharding/libs/sharded_transactions_helpers.js index b0e24aaf71a..05263868d30 100644 --- a/jstests/sharding/libs/sharded_transactions_helpers.js +++ b/jstests/sharding/libs/sharded_transactions_helpers.js @@ -1,6 +1,47 @@ const kSnapshotErrors = [ErrorCodes.SnapshotTooOld, ErrorCodes.SnapshotUnavailable, ErrorCodes.StaleChunkHistory]; +// List of failpoints in the coordinator's two-phase commit code. The associated data describes how +// many times each failpoint would be hit assuming a 3-participant transaction where one of the +// participants is co-located with the coordinator: +// - numTimesShouldBeHit: N means the failpoint should be hit N times during that phase; for +// example, if there are two remote participants, the hangWhileTargetingRemoteHost failpoint would +// be hit two times in the prepare phase. +// - skip: N means turn on the failpoint after the failpoint has been hit N times; it's used to turn +// on the remote and local targeting failpoints for the prepare and decision phase separately. +function getCoordinatorFailpoints() { + const coordinatorFailpointDataArr = [ + {failpoint: "hangBeforeWritingParticipantList", numTimesShouldBeHit: 1}, + { + // Test targeting remote nodes for prepare + failpoint: "hangWhileTargetingRemoteHost", + numTimesShouldBeHit: 2 /* once per remote participant */ + }, + { + // Test targeting local node for prepare + failpoint: "hangWhileTargetingLocalHost", + numTimesShouldBeHit: 1 + }, + {failpoint: "hangBeforeWritingDecision", numTimesShouldBeHit: 1}, + { + // Test targeting remote nodes for decision + failpoint: "hangWhileTargetingRemoteHost", + numTimesShouldBeHit: 2, /* once per remote participant */ + skip: 2 /* to skip when the failpoint is hit for prepare */ + }, + { + // Test targeting local node for decision + failpoint: "hangWhileTargetingLocalHost", + numTimesShouldBeHit: 1, + skip: 1 /* to skip when the failpoint is hit for prepare */ + }, + {failpoint: "hangBeforeDeletingCoordinatorDoc", numTimesShouldBeHit: 1}, + ]; + + // Return a deep copy of the array, so that the caller is free to modify its contents. + return coordinatorFailpointDataArr.map(failpoint => Object.assign({}, failpoint)); +} + function setFailCommandOnShards(st, mode, commands, code, numShards) { for (let i = 0; i < numShards; i++) { const shardConn = st["rs" + i].getPrimary(); diff --git a/jstests/sharding/txn_basic_two_phase_commit.js b/jstests/sharding/txn_basic_two_phase_commit.js index 530399df016..09f4f1bf0cf 100644 --- a/jstests/sharding/txn_basic_two_phase_commit.js +++ b/jstests/sharding/txn_basic_two_phase_commit.js @@ -175,7 +175,7 @@ } if (simulateNetworkFailures) { - startSimulatingNetworkFailures([participant1, participant2, coordinator]); + startSimulatingNetworkFailures([participant1, participant2]); } // Turn on failpoints so that the coordinator hangs after each write it does, so that the @@ -222,7 +222,7 @@ }); if (simulateNetworkFailures) { - stopSimulatingNetworkFailures([participant1, participant2, coordinator]); + stopSimulatingNetworkFailures([participant1, participant2]); } // Check that the transaction committed or aborted as expected. diff --git a/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js b/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js index ac9546c230a..415b99286b5 100644 --- a/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js +++ b/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js @@ -161,22 +161,7 @@ st.s.getDB(dbName).getCollection(collName).drop(); }; - const failpointDataArr = [ - {failpoint: "hangBeforeWritingParticipantList", numTimesShouldBeHit: 1}, - {failpoint: "hangBeforeWritingDecision", numTimesShouldBeHit: 1}, - {failpoint: "hangBeforeDeletingCoordinatorDoc", numTimesShouldBeHit: 1}, - { - // Test targeting for prepare - failpoint: "hangWhileTargetingRemoteHost", - numTimesShouldBeHit: 2 /* once per ~remote~ participant */ - }, - { - // Test targeting for decision (skip the first two times the failpoint is hit) - failpoint: "hangWhileTargetingRemoteHost", - numTimesShouldBeHit: 2, /* once per ~remote~ participant */ - skip: 2 - }, - ]; + const failpointDataArr = getCoordinatorFailpoints(); // Test abort path. diff --git a/jstests/sharding/txn_failover_two_phase_commit.js b/jstests/sharding/txn_failover_two_phase_commit.js index 0fc9f1b7a2c..ba185f33aed 100644 --- a/jstests/sharding/txn_failover_two_phase_commit.js +++ b/jstests/sharding/txn_failover_two_phase_commit.js @@ -25,8 +25,6 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; // the transaction timeout. TestData.transactionLifetimeLimitSeconds = 15; - let failpointCounter = 0; - let lsid = {id: UUID()}; let txnNumber = 0; @@ -127,12 +125,13 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; })); }; - const testCommitProtocol = function(makeAParticipantAbort, failpoint, expectAbortResponse) { + const testCommitProtocol = function( + makeAParticipantAbort, failpointData, expectAbortResponse) { jsTest.log("Testing commit protocol with sameNodeStepsUpAfterFailover: " + sameNodeStepsUpAfterFailover + ", overrideCoordinatorToBeConfigServer: " + overrideCoordinatorToBeConfigServer + ", makeAParticipantAbort: " + makeAParticipantAbort + ", expectAbortResponse: " + expectAbortResponse + - ", and failpoint: " + failpoint); + ", and failpointData: " + tojson(failpointData)); txnNumber++; setUp(); @@ -158,8 +157,9 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; } assert.commandWorked(coordPrimary.adminCommand({ - configureFailPoint: failpoint, + configureFailPoint: failpointData.failpoint, mode: "alwaysOn", + skip: (failpointData.skip ? failpointData.skip : 0), })); // Run commitTransaction through a parallel shell. @@ -171,7 +171,8 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; } // Wait for the desired failpoint to be hit. - waitForFailpoint("Hit " + failpoint + " failpoint", failpointCounter); + waitForFailpoint("Hit " + failpointData.failpoint + " failpoint", + failpointData.numTimesShouldBeHit); // Induce the coordinator primary to step down. const stepDownResult = assert.throws(function() { @@ -181,7 +182,7 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; 'Expected exception from stepping down coordinator primary ' + coordPrimary.host + ': ' + tojson(stepDownResult)); assert.commandWorked(coordPrimary.adminCommand({ - configureFailPoint: failpoint, + configureFailPoint: failpointData.failpoint, mode: "off", })); @@ -205,23 +206,24 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; } st.s.getDB(dbName).getCollection(collName).drop(); + clearRawMongoProgramOutput(); }; // // Run through all the failpoints when one participant responds to prepare with vote abort. // - ++failpointCounter; - - testCommitProtocol(true /* make a participant abort */, - "hangBeforeWritingParticipantList", - true /* expect abort decision */); - testCommitProtocol(true /* make a participant abort */, - "hangBeforeWritingDecision", - true /* expect abort decision */); - testCommitProtocol(true /* make a participant abort */, - "hangBeforeDeletingCoordinatorDoc", - true /* expect abort decision */); + failpointDataArr.forEach(function(failpointData) { + if (overrideCoordinatorToBeConfigServer && + failpointData.failpoint == "hangWhileTargetingLocalHost") { + // If the coordinator is overridden to be the config server, it will never target + // itself, so don't test the target local path. + return; + } + testCommitProtocol(true /* make a participant abort */, + failpointData, + true /* expect abort decision */); + }); // // Run through all the failpoints when all participants respond to prepare with vote commit. @@ -230,26 +232,27 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; // We only test two-phase commit (as opposed to two-phase abort) if the coordinator is // overridden to be the config server, because prepare does not yet support failover. if (overrideCoordinatorToBeConfigServer) { - ++failpointCounter; - - // Note: If the coordinator fails over before making the participant list durable, the - // transaction will abort even if all participants could have committed. This is a - // property of the coordinator only, and would be true even if a participant's - // in-progress transaction could survive failover. - testCommitProtocol(false /* all participants can commit */, - "hangBeforeWritingParticipantList", - true /* expect abort decision */); - - testCommitProtocol(false /* all participants can commit */, - "hangBeforeWritingDecision", - false /* expect commit decision */); - testCommitProtocol( - false /* all participants can commit */, "hangBeforeDeletingCoordinatorDoc", false - /* expect commit decision */); + failpointDataArr.forEach(function(failpointData) { + if (failpointData.failpoint == "hangWhileTargetingLocalHost") { + // If the coordinator is overridden to be the config server, it will never + // target itself, so don't test the target local path. + return; + } + // Note: If the coordinator fails over before making the participant list durable, + // the transaction will abort even if all participants could have committed. This is + // a property of the coordinator only, and would be true even if a participant's + // in-progress transaction could survive failover. + let expectAbort = + (failpointData.failpoint == "hangBeforeWritingParticipantList") || false; + testCommitProtocol( + false /* make a participant abort */, failpointData, expectAbort); + }); } st.stop(); }; + const failpointDataArr = getCoordinatorFailpoints(); + // // Coordinator is co-located with a participant // @@ -261,6 +264,14 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true; // Override coordinator to be config server // + // If the coordinator is overridden to be the config server, it will send a remote request + // rather than local request, so there is one additional remote request. + failpointDataArr.forEach(function(failpointData) { + if (failpointData.failpoint == "hangWhileTargetingRemoteHost") { + failpointData.numTimesShouldBeHit++; + } + }); + runTest(true /* same node always steps up after stepping down */, true); runTest(false /* same node always steps up after stepping down */, true); |