summary | refs | log | tree | commit | diff
path: root/jstests/sharding
diff options
context:
space:
mode:
author: Esha Maharishi <esha.maharishi@mongodb.com> 2019-02-05 09:33:17 -0500
committer: Esha Maharishi <esha.maharishi@mongodb.com> 2019-02-07 14:53:39 -0500
commit: b5563fde2a1a0beac8ce58db619300adc9f3a7d9 (patch)
tree: 24e9b194eb1e810d86b3eeb4cade576cf61f18c5 /jstests/sharding
parent: 24a0276f123949081c1cd0d1c7876551b3f065a1 (diff)
download: mongo-b5563fde2a1a0beac8ce58db619300adc9f3a7d9.tar.gz
SERVER-39053 Transaction coordinator should retry sending decision until definitively hearing an ack
Diffstat (limited to 'jstests/sharding')
-rw-r--r-- jstests/sharding/libs/sharded_transactions_helpers.js | 41
-rw-r--r-- jstests/sharding/txn_basic_two_phase_commit.js | 4
-rw-r--r-- jstests/sharding/txn_commit_coordination_is_robust_to_killop.js | 17
-rw-r--r-- jstests/sharding/txn_failover_two_phase_commit.js | 79
4 files changed, 89 insertions, 52 deletions
diff --git a/jstests/sharding/libs/sharded_transactions_helpers.js b/jstests/sharding/libs/sharded_transactions_helpers.js
index b0e24aaf71a..05263868d30 100644
--- a/jstests/sharding/libs/sharded_transactions_helpers.js
+++ b/jstests/sharding/libs/sharded_transactions_helpers.js
@@ -1,6 +1,47 @@
const kSnapshotErrors =
[ErrorCodes.SnapshotTooOld, ErrorCodes.SnapshotUnavailable, ErrorCodes.StaleChunkHistory];
+// Returns the list of failpoints in the coordinator's two-phase commit code. The associated data
+// describes how many times each failpoint would be hit assuming a 3-participant transaction where
+// one of the participants is co-located with the coordinator:
+// - numTimesShouldBeHit: N means the failpoint should be hit N times during that phase; for
+//   example, with two remote participants, the hangWhileTargetingRemoteHost failpoint would be
+//   hit two times in the prepare phase.
+// - skip: N means turn on the failpoint only after it has been hit N times; it's used to turn on
+//   the remote and local targeting failpoints for the prepare and decision phases separately.
+function getCoordinatorFailpoints() {
+ const coordinatorFailpointDataArr = [
+ {failpoint: "hangBeforeWritingParticipantList", numTimesShouldBeHit: 1},
+ {
+ // Test targeting remote nodes for prepare
+ failpoint: "hangWhileTargetingRemoteHost",
+ numTimesShouldBeHit: 2 /* once per remote participant */
+ },
+ {
+ // Test targeting local node for prepare
+ failpoint: "hangWhileTargetingLocalHost",
+ numTimesShouldBeHit: 1
+ },
+ {failpoint: "hangBeforeWritingDecision", numTimesShouldBeHit: 1},
+ {
+ // Test targeting remote nodes for decision
+ failpoint: "hangWhileTargetingRemoteHost",
+ numTimesShouldBeHit: 2, /* once per remote participant */
+ skip: 2 /* to skip when the failpoint is hit for prepare */
+ },
+ {
+ // Test targeting local node for decision
+ failpoint: "hangWhileTargetingLocalHost",
+ numTimesShouldBeHit: 1,
+ skip: 1 /* to skip when the failpoint is hit for prepare */
+ },
+ {failpoint: "hangBeforeDeletingCoordinatorDoc", numTimesShouldBeHit: 1},
+ ];
+
+ // Copy each (flat) entry into a new array, so that the caller is free to modify its contents.
+ return coordinatorFailpointDataArr.map(failpoint => Object.assign({}, failpoint));
+}
+
function setFailCommandOnShards(st, mode, commands, code, numShards) {
for (let i = 0; i < numShards; i++) {
const shardConn = st["rs" + i].getPrimary();
diff --git a/jstests/sharding/txn_basic_two_phase_commit.js b/jstests/sharding/txn_basic_two_phase_commit.js
index 530399df016..09f4f1bf0cf 100644
--- a/jstests/sharding/txn_basic_two_phase_commit.js
+++ b/jstests/sharding/txn_basic_two_phase_commit.js
@@ -175,7 +175,7 @@
}
if (simulateNetworkFailures) {
- startSimulatingNetworkFailures([participant1, participant2, coordinator]);
+ startSimulatingNetworkFailures([participant1, participant2]);
}
// Turn on failpoints so that the coordinator hangs after each write it does, so that the
@@ -222,7 +222,7 @@
});
if (simulateNetworkFailures) {
- stopSimulatingNetworkFailures([participant1, participant2, coordinator]);
+ stopSimulatingNetworkFailures([participant1, participant2]);
}
// Check that the transaction committed or aborted as expected.
diff --git a/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js b/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js
index ac9546c230a..415b99286b5 100644
--- a/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js
+++ b/jstests/sharding/txn_commit_coordination_is_robust_to_killop.js
@@ -161,22 +161,7 @@
st.s.getDB(dbName).getCollection(collName).drop();
};
- const failpointDataArr = [
- {failpoint: "hangBeforeWritingParticipantList", numTimesShouldBeHit: 1},
- {failpoint: "hangBeforeWritingDecision", numTimesShouldBeHit: 1},
- {failpoint: "hangBeforeDeletingCoordinatorDoc", numTimesShouldBeHit: 1},
- {
- // Test targeting for prepare
- failpoint: "hangWhileTargetingRemoteHost",
- numTimesShouldBeHit: 2 /* once per ~remote~ participant */
- },
- {
- // Test targeting for decision (skip the first two times the failpoint is hit)
- failpoint: "hangWhileTargetingRemoteHost",
- numTimesShouldBeHit: 2, /* once per ~remote~ participant */
- skip: 2
- },
- ];
+ const failpointDataArr = getCoordinatorFailpoints();
// Test abort path.
diff --git a/jstests/sharding/txn_failover_two_phase_commit.js b/jstests/sharding/txn_failover_two_phase_commit.js
index 0fc9f1b7a2c..ba185f33aed 100644
--- a/jstests/sharding/txn_failover_two_phase_commit.js
+++ b/jstests/sharding/txn_failover_two_phase_commit.js
@@ -25,8 +25,6 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
// the transaction timeout.
TestData.transactionLifetimeLimitSeconds = 15;
- let failpointCounter = 0;
-
let lsid = {id: UUID()};
let txnNumber = 0;
@@ -127,12 +125,13 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
}));
};
- const testCommitProtocol = function(makeAParticipantAbort, failpoint, expectAbortResponse) {
+ const testCommitProtocol = function(
+ makeAParticipantAbort, failpointData, expectAbortResponse) {
jsTest.log("Testing commit protocol with sameNodeStepsUpAfterFailover: " +
sameNodeStepsUpAfterFailover + ", overrideCoordinatorToBeConfigServer: " +
overrideCoordinatorToBeConfigServer + ", makeAParticipantAbort: " +
makeAParticipantAbort + ", expectAbortResponse: " + expectAbortResponse +
- ", and failpoint: " + failpoint);
+ ", and failpointData: " + tojson(failpointData));
txnNumber++;
setUp();
@@ -158,8 +157,9 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
}
assert.commandWorked(coordPrimary.adminCommand({
- configureFailPoint: failpoint,
+ configureFailPoint: failpointData.failpoint,
mode: "alwaysOn",
+ skip: (failpointData.skip ? failpointData.skip : 0),
}));
// Run commitTransaction through a parallel shell.
@@ -171,7 +171,8 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
}
// Wait for the desired failpoint to be hit.
- waitForFailpoint("Hit " + failpoint + " failpoint", failpointCounter);
+ waitForFailpoint("Hit " + failpointData.failpoint + " failpoint",
+ failpointData.numTimesShouldBeHit);
// Induce the coordinator primary to step down.
const stepDownResult = assert.throws(function() {
@@ -181,7 +182,7 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
'Expected exception from stepping down coordinator primary ' +
coordPrimary.host + ': ' + tojson(stepDownResult));
assert.commandWorked(coordPrimary.adminCommand({
- configureFailPoint: failpoint,
+ configureFailPoint: failpointData.failpoint,
mode: "off",
}));
@@ -205,23 +206,24 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
}
st.s.getDB(dbName).getCollection(collName).drop();
+ clearRawMongoProgramOutput();
};
//
// Run through all the failpoints when one participant responds to prepare with vote abort.
//
- ++failpointCounter;
-
- testCommitProtocol(true /* make a participant abort */,
- "hangBeforeWritingParticipantList",
- true /* expect abort decision */);
- testCommitProtocol(true /* make a participant abort */,
- "hangBeforeWritingDecision",
- true /* expect abort decision */);
- testCommitProtocol(true /* make a participant abort */,
- "hangBeforeDeletingCoordinatorDoc",
- true /* expect abort decision */);
+ failpointDataArr.forEach(function(failpointData) {
+ if (overrideCoordinatorToBeConfigServer &&
+ failpointData.failpoint == "hangWhileTargetingLocalHost") {
+ // If the coordinator is overridden to be the config server, it will never target
+ // itself, so don't test the target local path.
+ return;
+ }
+ testCommitProtocol(true /* make a participant abort */,
+ failpointData,
+ true /* expect abort decision */);
+ });
//
// Run through all the failpoints when all participants respond to prepare with vote commit.
@@ -230,26 +232,27 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
// We only test two-phase commit (as opposed to two-phase abort) if the coordinator is
// overridden to be the config server, because prepare does not yet support failover.
if (overrideCoordinatorToBeConfigServer) {
- ++failpointCounter;
-
- // Note: If the coordinator fails over before making the participant list durable, the
- // transaction will abort even if all participants could have committed. This is a
- // property of the coordinator only, and would be true even if a participant's
- // in-progress transaction could survive failover.
- testCommitProtocol(false /* all participants can commit */,
- "hangBeforeWritingParticipantList",
- true /* expect abort decision */);
-
- testCommitProtocol(false /* all participants can commit */,
- "hangBeforeWritingDecision",
- false /* expect commit decision */);
- testCommitProtocol(
- false /* all participants can commit */, "hangBeforeDeletingCoordinatorDoc", false
- /* expect commit decision */);
+ failpointDataArr.forEach(function(failpointData) {
+ if (failpointData.failpoint == "hangWhileTargetingLocalHost") {
+ // If the coordinator is overridden to be the config server, it will never
+ // target itself, so don't test the target local path.
+ return;
+ }
+ // Note: If the coordinator fails over before making the participant list durable,
+ // the transaction will abort even if all participants could have committed. This is
+ // a property of the coordinator only, and would be true even if a participant's
+ // in-progress transaction could survive failover.
+ let expectAbort =
+ (failpointData.failpoint == "hangBeforeWritingParticipantList") || false;
+ testCommitProtocol(
+ false /* make a participant abort */, failpointData, expectAbort);
+ });
}
st.stop();
};
+ const failpointDataArr = getCoordinatorFailpoints();
+
//
// Coordinator is co-located with a participant
//
@@ -261,6 +264,14 @@ TestData.skipCheckingUUIDsConsistentAcrossCluster = true;
// Override coordinator to be config server
//
+ // If the coordinator is overridden to be the config server, it will send a remote request
+ // rather than local request, so there is one additional remote request.
+ failpointDataArr.forEach(function(failpointData) {
+ if (failpointData.failpoint == "hangWhileTargetingRemoteHost") {
+ failpointData.numTimesShouldBeHit++;
+ }
+ });
+
runTest(true /* same node always steps up after stepping down */, true);
runTest(false /* same node always steps up after stepping down */, true);