summary | refs | log | tree | commit | diff
path: root/jstests/replsets
diff options
context:
space:
mode:
author: Siyuan Zhou <siyuan.zhou@mongodb.com> 2017-04-20 15:28:38 -0400
committer: Siyuan Zhou <siyuan.zhou@mongodb.com> 2017-04-20 22:36:43 -0400
commit: 4680351e3fe6f8f47c04440f1c5d1232a4ab7b2d (patch)
tree: dae7b10842b2e1899a683adf4a545759182ce2ab /jstests/replsets
parent: 8b437e7a762e3ef99848659dc0d68df1e2add0a4 (diff)
download: mongo-4680351e3fe6f8f47c04440f1c5d1232a4ab7b2d.tar.gz
SERVER-26848 Exit catchup mode when not syncing more data.
This reverts commit c08590a6ac9dc54c9d910822d47ea17140b56f89.
Diffstat (limited to 'jstests/replsets')
-rw-r--r-- jstests/replsets/catchup.js 187
-rw-r--r-- jstests/replsets/rslib.js 1
2 files changed, 117 insertions, 71 deletions
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 542ad51c723..51632379463 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -12,6 +12,7 @@
rst.startSet();
var conf = rst.getReplSetConfig();
+ conf.members[2].priority = 0;
conf.settings = {
heartbeatIntervalMillis: 500,
electionTimeoutMillis: 10000,
@@ -34,7 +35,7 @@
node.adminCommand(verbosity);
});
- function stepUp(node) {
+ function stepUpNode(node) {
assert.soon(function() {
node.adminCommand({replSetStepUp: 1});
return node.adminCommand('replSetGetStatus').myState == ReplSetTest.State.PRIMARY;
@@ -43,12 +44,6 @@
return node;
}
- function doWrites(node) {
- for (var i = 0; i < 3; i++) {
- assert.writeOK(node.getDB("test").foo.insert({x: i}));
- }
- }
-
function checkOpInOplog(node, op, count) {
node.getDB("admin").getMongo().setSlaveOk();
var oplog = node.getDB("local")['oplog.rs'];
@@ -56,98 +51,148 @@
assert.eq(oplog.count(op), count, "op: " + tojson(op) + ", oplog: " + tojson(oplogArray));
}
- function isEarlierTimestamp(ts1, ts2) {
- if (ts1.getTime() == ts2.getTime()) {
- return ts1.getInc() < ts2.getInc();
+ // Stop replication on secondaries, do writes and step up one of the secondaries.
+ //
+ // The old primary has extra writes that are not replicated to the other nodes yet,
+ // but the new primary steps up, getting the vote from the third node "voter".
+ function stopRelicationAndEnforceNewPrimaryToCatchUp() {
+ // Write documents that cannot be replicated to secondaries in time.
+ var oldSecondaries = rst.getSecondaries();
+ var oldPrimary = rst.getPrimary();
+ stopServerReplication(oldSecondaries);
+ for (var i = 0; i < 3; i++) {
+ assert.writeOK(oldPrimary.getDB("test").foo.insert({x: i}));
}
- return ts1.getTime() < ts2.getTime();
+ var latestOpOnOldPrimary = getLatestOp(oldPrimary);
+ // New primary wins immediately, but needs to catch up.
+ var newPrimary = stepUpNode(oldSecondaries[0]);
+ rst.awaitNodesAgreeOnPrimary();
+ var latestOpOnNewPrimary = getLatestOp(newPrimary);
+ // Check this node is not writable.
+ assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
+
+ return {
+ oldSecondaries: oldSecondaries,
+ oldPrimary: oldPrimary,
+ newPrimary: newPrimary,
+ voter: oldSecondaries[1],
+ latestOpOnOldPrimary: latestOpOnOldPrimary,
+ latestOpOnNewPrimary: latestOpOnNewPrimary
+ };
+ }
+
+ function reconfigCatchUpTimeoutMillis(timeout) {
+ // Reconnect all nodes to make sure reconfig succeeds.
+ rst.nodes.forEach(reconnect);
+ // Reconfigure the replica set to use the given catchup timeout.
+ conf = rst.getReplSetConfigFromNode();
+ conf.version++;
+ conf.settings.catchUpTimeoutMillis = timeout;
+ reconfig(rst, conf);
+ rst.awaitReplication();
+ rst.awaitNodesAgreeOnPrimary();
}
- rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS, ReplSetTest.OpTimeType.LAST_DURABLE);
+ rst.awaitReplication();
- jsTest.log("Case 1: The primary is up-to-date after freshness scan.");
+ jsTest.log("Case 1: The primary is up-to-date after refreshing heartbeats.");
// Should complete transition to primary immediately.
- var newPrimary = stepUp(rst.getSecondary());
+ var newPrimary = stepUpNode(rst.getSecondary());
rst.awaitNodesAgreeOnPrimary();
// Should win an election and finish the transition very quickly.
assert.eq(newPrimary, rst.getPrimary());
- rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS, ReplSetTest.OpTimeType.LAST_DURABLE);
+ rst.awaitReplication();
jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
- // Write documents that cannot be replicated to secondaries in time.
- var originalSecondaries = rst.getSecondaries();
- stopServerReplication(originalSecondaries);
- doWrites(rst.getPrimary());
- var latestOp = getLatestOp(rst.getPrimary());
- // New primary wins immediately, but needs to catch up.
- newPrimary = stepUp(rst.getSecondary());
- rst.awaitNodesAgreeOnPrimary();
- // Check this node is not writable.
- assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
+ var stepUpResults = stopRelicationAndEnforceNewPrimaryToCatchUp();
+
// Disable fail point to allow replication.
- restartServerReplication(originalSecondaries);
+ restartServerReplication(stepUpResults.oldSecondaries);
// getPrimary() blocks until the primary finishes drain mode.
- assert.eq(newPrimary, rst.getPrimary());
+ assert.eq(stepUpResults.newPrimary, rst.getPrimary());
// Wait for all secondaries to catch up
rst.awaitReplication();
// Check the latest op on old primary is preserved on the new one.
- checkOpInOplog(newPrimary, latestOp, 1);
- rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS, ReplSetTest.OpTimeType.LAST_DURABLE);
+ checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 1);
+ rst.awaitReplication();
jsTest.log("Case 3: The primary needs to catch up, but has to change sync source to catch up.");
- // Write documents that cannot be replicated to secondaries in time.
- stopServerReplication(rst.getSecondaries());
- doWrites(rst.getPrimary());
- var oldPrimary = rst.getPrimary();
- originalSecondaries = rst.getSecondaries();
- latestOp = getLatestOp(oldPrimary);
- newPrimary = stepUp(originalSecondaries[0]);
- rst.awaitNodesAgreeOnPrimary();
- // Disable fail point on one of the other secondaries.
- // Wait until it catches up with the old primary.
- restartServerReplication(originalSecondaries[1]);
- assert.commandWorked(originalSecondaries[1].adminCommand({replSetSyncFrom: oldPrimary.host}));
- awaitOpTime(originalSecondaries[1], latestOp.ts);
+ stepUpResults = stopRelicationAndEnforceNewPrimaryToCatchUp();
+
+ // Disable fail point on the voter. Wait until it catches up with the old primary.
+ restartServerReplication(stepUpResults.voter);
+ assert.commandWorked(
+ stepUpResults.voter.adminCommand({replSetSyncFrom: stepUpResults.oldPrimary.host}));
+ awaitOpTime(stepUpResults.voter, stepUpResults.latestOpOnOldPrimary.ts);
// Disconnect the new primary and the old one.
- oldPrimary.disconnect(newPrimary);
+ stepUpResults.oldPrimary.disconnect(stepUpResults.newPrimary);
// Disable the failpoint, the new primary should sync from the other secondary.
- restartServerReplication(newPrimary);
- assert.eq(newPrimary, rst.getPrimary());
- checkOpInOplog(newPrimary, latestOp, 1);
+ restartServerReplication(stepUpResults.newPrimary);
+ assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+ checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 1);
// Restore the broken connection
- oldPrimary.reconnect(newPrimary);
- rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS, ReplSetTest.OpTimeType.LAST_DURABLE);
+ stepUpResults.oldPrimary.reconnect(stepUpResults.newPrimary);
+ rst.awaitReplication();
jsTest.log("Case 4: The primary needs to catch up, fails due to timeout.");
- // Reconfigure replicaset to decrease catchup timeout
- conf = rst.getReplSetConfigFromNode();
- conf.version++;
- conf.settings.catchUpTimeoutMillis = 10 * 1000;
- reconfig(rst, conf);
- rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS, ReplSetTest.OpTimeType.LAST_DURABLE);
- rst.awaitNodesAgreeOnPrimary();
+ reconfigCatchUpTimeoutMillis(10 * 1000);
- // Write documents that cannot be replicated to secondaries in time.
- originalSecondaries = rst.getSecondaries();
- stopServerReplication(originalSecondaries);
- doWrites(rst.getPrimary());
- latestOp = getLatestOp(rst.getPrimary());
-
- // New primary wins immediately, but needs to catch up.
- newPrimary = stepUp(originalSecondaries[0]);
- rst.awaitNodesAgreeOnPrimary();
- var latestOpOnNewPrimary = getLatestOp(newPrimary);
+ stepUpResults = stopRelicationAndEnforceNewPrimaryToCatchUp();
// Wait until the new primary completes the transition to primary and writes a no-op.
- checkLog.contains(newPrimary, "Cannot catch up oplog after becoming primary");
- restartServerReplication(newPrimary);
- assert.eq(newPrimary, rst.getPrimary());
+ checkLog.contains(stepUpResults.newPrimary, "Catchup timed out after becoming primary");
+ restartServerReplication(stepUpResults.newPrimary);
+ assert.eq(stepUpResults.newPrimary, rst.getPrimary());
+
+ // Wait for the no-op "new primary" after winning an election, so that we know it has
+ // finished transition to primary.
+ assert.soon(function() {
+ return rs.compareOpTimes(stepUpResults.latestOpOnOldPrimary,
+ getLatestOp(stepUpResults.newPrimary)) < 0;
+ });
+ // The extra oplog entries on the old primary are not replicated to the new one.
+ checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0);
+ restartServerReplication(stepUpResults.voter);
+ rst.awaitReplication();
+
+ jsTest.log("Case 5: The primary needs to catch up with no timeout, then gets aborted.");
+ reconfigCatchUpTimeoutMillis(-1);
+ stepUpResults = stopRelicationAndEnforceNewPrimaryToCatchUp();
+
+ // Abort catchup.
+ assert.commandWorked(stepUpResults.newPrimary.adminCommand({replSetAbortPrimaryCatchUp: 1}));
// Wait for the no-op "new primary" after winning an election, so that we know it has
// finished transition to primary.
assert.soon(function() {
- return isEarlierTimestamp(latestOpOnNewPrimary.ts, getLatestOp(newPrimary).ts);
+ return rs.compareOpTimes(stepUpResults.latestOpOnOldPrimary,
+ getLatestOp(stepUpResults.newPrimary)) < 0;
});
// The extra oplog entries on the old primary are not replicated to the new one.
- checkOpInOplog(newPrimary, latestOp, 0);
- restartServerReplication(originalSecondaries[1]);
+ checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0);
+ restartServerReplication(stepUpResults.oldSecondaries);
+ rst.awaitReplication();
+ checkOpInOplog(stepUpResults.newPrimary, stepUpResults.latestOpOnOldPrimary, 0);
+
+ // TODO: Uncomment case 6 when SERVER-28751 gets fixed.
+ //
+ // jsTest.log("Case 6: The primary needs to catch up with no timeout, but steps down.");
+ // var stepUpResults = stopRelicationAndEnforceNewPrimaryToCatchUp();
+
+ // // Step-down command should abort catchup.
+ // try {
+ // printjson(stepUpResults.newPrimary.adminCommand({replSetStepDown: 60}));
+ // } catch (e) {
+ // print(e);
+ // }
+ // // Rename the primary.
+ // var steppedDownPrimary = stepUpResults.newPrimary;
+ // var newPrimary = rst.getPrimary();
+ // assert.neq(newPrimary, steppedDownPrimary);
+
+ // // Enable data replication on the stepped down primary and make sure it syncs old writes.
+ // rst.nodes.forEach(reconnect);
+ // restartServerReplication(stepUpResults.oldSecondaries);
+ // rst.awaitReplication();
+ // checkOpInOplog(steppedDownPrimary, stepUpResults.latestOpOnOldPrimary, 1);
+
})();
diff --git a/jstests/replsets/rslib.js b/jstests/replsets/rslib.js
index 1471824bd8f..5911723d717 100644
--- a/jstests/replsets/rslib.js
+++ b/jstests/replsets/rslib.js
@@ -162,6 +162,7 @@ var getLastOpTime;
if (!isNetworkError(e)) {
throw e;
}
+ print("Calling replSetReconfig failed. " + tojson(e));
}
var master = rs.getPrimary().getDB("admin");