author     William Schultz <william.schultz@mongodb.com>     2020-04-27 12:17:43 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2020-04-27 19:32:41 +0000
commit     a30cef00b9e3ccfe60a4dce7f9f2c7cbaf1e68b7
tree       70f6fb883c17fe301ba5f7742fda4a0e1dbb7843
parent     d140082164a67ebc5eb935aa7982b208f18ea274
SERVER-44068 Speed up transition to 'kSteadyStateOps' in RollbackTest
-rw-r--r--  jstests/libs/write_concern_util.js        8
-rw-r--r--  jstests/replsets/libs/rollback_test.js   74
-rw-r--r--  src/mongo/db/repl/bgsync.cpp             18
-rw-r--r--  src/mongo/db/repl/bgsync.h                2
-rw-r--r--  src/mongo/shell/check_log.js             15
-rw-r--r--  src/mongo/shell/replsettest.js           10
6 files changed, 77 insertions, 50 deletions
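
Note on the change as a whole: the commit cuts down on fixed sleeps while RollbackTest drives a replica set into and out of rollback. The shell helpers (stopServerReplication, checkLog.contains, awaitSecondaryNodesForRollbackTest, awaitReplication) gain an optional retryIntervalMS parameter so RollbackTest can poll every 25ms instead of every 200-300ms. On the server side, bgsync's hard-coded 1-second sync source retry sleep becomes overridable through the forceBgSyncSyncSourceRetryWaitMS fail point, and the rollback node is sent a fresh round of heartbeats right after being reconnected so it can pick a sync source without waiting out a heartbeat interval.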
diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js
index 90832a858d8..dfbae445d2e 100644
--- a/jstests/libs/write_concern_util.js
+++ b/jstests/libs/write_concern_util.js
@@ -18,7 +18,8 @@ function shardCollectionWithChunks(st, coll, numDocs) {
}
// Stops replication on the given server(s).
-function stopServerReplication(conn) {
+function stopServerReplication(conn, retryIntervalMS) {
+ retryIntervalMS = retryIntervalMS || 300;
if (conn.length) {
conn.forEach(function(n) {
stopServerReplication(n);
@@ -37,7 +38,10 @@ function stopServerReplication(conn) {
// the fail point won't be hit until the node transitions from being the primary.
if (assert.commandWorked(conn.adminCommand('replSetGetStatus')).myState !=
ReplSetTest.State.PRIMARY) {
- checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled');
+ checkLog.contains(conn,
+ 'bgsync - stopReplProducer fail point enabled',
+ ReplSetTest.kDefaultTimeoutMS,
+ retryIntervalMS);
}
}
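
A usage sketch for the new parameter (a hypothetical test, not part of this commit; the node count and the 25ms interval are illustrative):

    load("jstests/libs/write_concern_util.js");

    const rst = new ReplSetTest({nodes: 2});
    rst.startSet();
    rst.initiate();

    // Enable the 'stopReplProducer' fail point on the secondary, then poll its
    // log every 25ms (instead of the default 300ms) until bgsync reports that
    // the fail point was actually hit.
    stopServerReplication(rst.getSecondary(), 25 /* retryIntervalMS */);

    // ... writes made here will not replicate to the secondary ...

    restartServerReplication(rst.getSecondary());
    rst.stopSet();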
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index 75285ee1714..18f51de84e9 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -43,6 +43,7 @@
load("jstests/replsets/rslib.js");
load("jstests/replsets/libs/two_phase_drops.js");
load("jstests/hooks/validate_collections.js");
+load('jstests/libs/fail_point_util.js');
/**
*
@@ -90,6 +91,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
const SIGTERM = 15;
const kNumDataBearingNodes = 3;
const kElectableNodes = 2;
+ const kRetryIntervalMS = 25;
let awaitSecondaryNodesForRollbackTimeout;
@@ -155,6 +157,11 @@ function RollbackTest(name = "RollbackTest", replSet) {
waitForState(curSecondary, ReplSetTest.State.SECONDARY);
waitForState(tiebreakerNode, ReplSetTest.State.SECONDARY);
+ // Make sync source selection faster.
+ replSet.nodes.forEach((node) => {
+ setFailPoint(node, "forceBgSyncSyncSourceRetryWaitMS", {sleepMS: kRetryIntervalMS});
+ });
+
rst = replSet;
lastRBID = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
@@ -271,10 +278,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
assert.soonNoExcept(() => {
const res = conn.adminCommand({replSetStepUp: 1});
return res.ok;
- }, `failed to step up node ${conn.host}`, ReplSetTest.kDefaultTimeoutMS, 25);
+ }, `failed to step up node ${conn.host}`, ReplSetTest.kDefaultTimeoutMS, kRetryIntervalMS);
// Waits for the primary to accept new writes.
- return rst.getPrimary(ReplSetTest.kDefaultTimeoutMS, 25);
+ return rst.getPrimary(ReplSetTest.kDefaultTimeoutMS, kRetryIntervalMS);
}
function oplogTop(conn) {
@@ -320,41 +327,45 @@ function RollbackTest(name = "RollbackTest", replSet) {
// suites, as it is possible in rare cases for the sync source to lose the entry
// corresponding to the optime the rollback node chose as its minValid.
log(`Wait for ${curSecondary.host} to finish rollback`);
- assert.soonNoExcept(() => {
- try {
- log(`Wait for secondary ${curSecondary} and tiebreaker ${tiebreakerNode}`);
- rst.awaitSecondaryNodesForRollbackTest(
- awaitSecondaryNodesForRollbackTimeout,
- [curSecondary, tiebreakerNode],
- curSecondary /* connToCheckForUnrecoverableRollback */);
- } catch (e) {
- if (e.unrecoverableRollbackDetected) {
- log(`Detected unrecoverable rollback on ${curSecondary.host}. Ending test.`,
- true /* important */);
- TestData.skipCheckDBHashes = true;
- rst.stopSet();
- quit();
+ assert.soonNoExcept(
+ () => {
+ try {
+ log(`Wait for secondary ${curSecondary} and tiebreaker ${tiebreakerNode}`);
+ rst.awaitSecondaryNodesForRollbackTest(
+ awaitSecondaryNodesForRollbackTimeout,
+ [curSecondary, tiebreakerNode],
+ curSecondary /* connToCheckForUnrecoverableRollback */,
+ kRetryIntervalMS);
+ } catch (e) {
+ if (e.unrecoverableRollbackDetected) {
+ log(`Detected unrecoverable rollback on ${curSecondary.host}. Ending test.`,
+ true /* important */);
+ TestData.skipCheckDBHashes = true;
+ rst.stopSet();
+ quit();
+ }
+ // Re-throw the original exception in all other cases.
+ throw e;
}
- // Re-throw the original exception in all other cases.
- throw e;
- }
- let rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
- assert(rbid > lastRBID,
- `Expected RBID to increment past ${lastRBID} on ${curSecondary.host}`);
+ let rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
+ assert(rbid > lastRBID,
+ `Expected RBID to increment past ${lastRBID} on ${curSecondary.host}`);
- assert.eq(oplogTop(curPrimary), oplogTop(curSecondary));
-
- return true;
- });
+ assert.eq(oplogTop(curPrimary), oplogTop(curSecondary));
+ return true;
+ },
+ `Waiting for rollback to complete on ${curSecondary.host} failed`,
+ ReplSetTest.kDefaultTimeoutMS,
+ kRetryIntervalMS);
log(`Rollback on ${curSecondary.host} completed, reconnecting tiebreaker`, true);
tiebreakerNode.reconnect([curPrimary, curSecondary]);
// Allow replication temporarily so the following checks succeed.
restartServerReplication(tiebreakerNode);
- rst.awaitReplication(null, null, [curSecondary, tiebreakerNode]);
+ rst.awaitReplication(null, null, [curSecondary, tiebreakerNode], kRetryIntervalMS);
log(`awaitReplication completed`, true);
// We call transition to steady state ops after awaiting replication has finished,
@@ -373,7 +384,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
// Now that awaitReplication and checkDataConsistency are done, stop replication again so
// tiebreakerNode is never part of w: majority writes, see comment at top.
- stopServerReplication(tiebreakerNode);
+ stopServerReplication(tiebreakerNode, kRetryIntervalMS);
log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`);
return curPrimary;
};
@@ -496,8 +507,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`);
// Reconnect the rollback node to the current primary, which is the node we want to sync
// from. If we reconnect to both the current primary and the tiebreaker node, the rollback
- // node may choose the tiebreaker.
+ // node may choose the tiebreaker. Send out a new round of heartbeats immediately so that
+ // the rollback node can find a sync source quickly.
curSecondary.reconnect([curPrimary]);
+ assert.commandWorked(curSecondary.adminCommand({replSetTest: 1, restartHeartbeats: 1}));
log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`);
return curPrimary;
@@ -566,6 +579,9 @@ function RollbackTest(name = "RollbackTest", replSet) {
// Fail-point will clear on restart so do post-start.
setFastGetMoreEnabled(rst.nodes[nodeId]);
+ // Make sync source selection faster.
+ setFailPoint(
+ rst.nodes[nodeId], "forceBgSyncSyncSourceRetryWaitMS", {sleepMS: kRetryIntervalMS});
// Step up if the restarted node is the current primary.
if (rst.getNodeId(curPrimary) === nodeId) {
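
Outside of RollbackTest, a test could apply the same speed-up recipe by hand. A minimal sketch, assuming an already-initiated ReplSetTest named rst (the interval is illustrative):

    load('jstests/libs/fail_point_util.js');

    const kRetryIntervalMS = 25;
    // Shorten bgsync's sync source retry sleep (default 1000ms) on every node.
    rst.nodes.forEach((node) => {
        setFailPoint(node, "forceBgSyncSyncSourceRetryWaitMS", {sleepMS: kRetryIntervalMS});
    });
    // Trigger an immediate round of heartbeats so a freshly reconnected node
    // can find a sync source without waiting for the next heartbeat interval.
    assert.commandWorked(
        rst.getSecondary().adminCommand({replSetTest: 1, restartHeartbeats: 1}));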
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 02cb1daff36..a81b0ebbb11 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -265,7 +265,7 @@ void BackgroundSync::_produce() {
LOGV2(21079,
"bgsync - stopReplProducer fail point "
"enabled. Blocking until fail point is disabled.");
- mongo::sleepsecs(1);
+ mongo::sleepmillis(_getRetrySleepMS());
return;
}
@@ -407,10 +407,7 @@ void BackgroundSync::_produce() {
// out of date. In that case we sleep for 1 second to reduce the amount we spin waiting
// for our map to update.
if (oldSource == source) {
- long long sleepMS = 1000;
- forceBgSyncSyncSourceRetryWaitMS.execute(
- [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
-
+ long long sleepMS = _getRetrySleepMS();
LOGV2(21087,
"Chose same sync source candidate as last time, {syncSource}. Sleeping for "
"{sleepDurationMillis}ms to avoid immediately choosing a new sync source for the "
@@ -439,10 +436,7 @@ void BackgroundSync::_produce() {
"error"_attr = syncSourceResp.syncSourceStatus.getStatus());
}
- long long sleepMS = 1000;
- forceBgSyncSyncSourceRetryWaitMS.execute(
- [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
-
+ long long sleepMS = _getRetrySleepMS();
// No sync source found.
LOGV2_DEBUG(21090,
1,
@@ -962,6 +956,12 @@ void BackgroundSync::startProducerIfStopped() {
}
}
+long long BackgroundSync::_getRetrySleepMS() {
+ long long sleepMS = 1000;
+ forceBgSyncSyncSourceRetryWaitMS.execute(
+ [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
+ return sleepMS;
+}
} // namespace repl
} // namespace mongo
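
The fail-point-aware sleep computation that was previously duplicated at the two sync-source retry sites is now centralized in _getRetrySleepMS(), and the stopReplProducer wait loop picks it up as well. From the shell, the override is a plain configureFailPoint command (a sketch; node stands for any mongod connection):

    // _getRetrySleepMS() reads data["sleepMS"] as an integer number of
    // milliseconds, so pass an int.
    assert.commandWorked(node.adminCommand({
        configureFailPoint: "forceBgSyncSyncSourceRetryWaitMS",
        mode: "alwaysOn",
        data: {sleepMS: 25},
    }));

    // Turning the fail point off restores the default 1000ms retry sleep.
    assert.commandWorked(node.adminCommand(
        {configureFailPoint: "forceBgSyncSyncSourceRetryWaitMS", mode: "off"}));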
diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h
index 5e7558fc7bd..3502597a179 100644
--- a/src/mongo/db/repl/bgsync.h
+++ b/src/mongo/db/repl/bgsync.h
@@ -211,6 +211,8 @@ private:
OpTime _readLastAppliedOpTime(OperationContext* opCtx);
+ long long _getRetrySleepMS();
+
// This OplogApplier applies oplog entries fetched from the sync source.
OplogApplier* const _oplogApplier;
diff --git a/src/mongo/shell/check_log.js b/src/mongo/shell/check_log.js
index 82f0792a2b8..fcd70af061c 100644
--- a/src/mongo/shell/check_log.js
+++ b/src/mongo/shell/check_log.js
@@ -118,13 +118,16 @@ checkLog = (function() {
* Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
* the provided 'msg' is found in the logs, or it times out. Throws an exception on timeout.
*/
- let contains = function(conn, msg, timeout = 5 * 60 * 1000) {
+ let contains = function(conn, msg, timeout = 5 * 60 * 1000, retryIntervalMS = 300) {
// Don't run the hang analyzer because we don't expect contains() to always succeed.
- assert.soon(function() {
- return checkContainsOnce(conn, msg);
- }, 'Could not find log entries containing the following message: ' + msg, timeout, 300, {
- runHangAnalyzer: false
- });
+ assert.soon(
+ function() {
+ return checkContainsOnce(conn, msg);
+ },
+ 'Could not find log entries containing the following message: ' + msg,
+ timeout,
+ retryIntervalMS,
+ {runHangAnalyzer: false});
};
let containsJson = function(conn, id, attrsDict, timeout = 5 * 60 * 1000) {
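
With the new retryIntervalMS parameter, hot-path callers can poll the log faster than the previous hard-coded 300ms. A hypothetical call (conn, the message, and the timeout are illustrative):

    // Wait up to 60 seconds for the message, re-reading the log every 25ms.
    checkLog.contains(conn, "rollback finished", 60 * 1000, 25 /* retryIntervalMS */);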
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index a2930b60d27..f08a51e239a 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -682,9 +682,10 @@ var ReplSetTest = function(opts) {
* if it throws.
*/
this.awaitSecondaryNodesForRollbackTest = function(
- timeout, slaves, connToCheckForUnrecoverableRollback) {
+ timeout, slaves, connToCheckForUnrecoverableRollback, retryIntervalMS) {
+ retryIntervalMS = retryIntervalMS || 200;
try {
- this.awaitSecondaryNodes(timeout, slaves);
+ this.awaitSecondaryNodes(timeout, slaves, retryIntervalMS);
} catch (originalEx) {
// There is a special case where we expect the (rare) possibility of unrecoverable
// rollbacks with EMRC:false in rollback suites with unclean shutdowns.
@@ -1679,12 +1680,13 @@ var ReplSetTest = function(opts) {
// Wait until the optime of the specified type reaches the primary's last applied optime. Blocks
// on all secondary nodes or just 'slaves', if specified. The timeout will reset if any of the
// secondaries makes progress.
- this.awaitReplication = function(timeout, secondaryOpTimeType, slaves) {
+ this.awaitReplication = function(timeout, secondaryOpTimeType, slaves, retryIntervalMS) {
if (slaves !== undefined && slaves !== self._slaves) {
print("ReplSetTest awaitReplication: going to check only " + slaves.map(s => s.host));
}
timeout = timeout || self.kDefaultTimeoutMS;
+ retryIntervalMS = retryIntervalMS || 200;
secondaryOpTimeType = secondaryOpTimeType || ReplSetTest.OpTimeType.LAST_APPLIED;
@@ -1878,7 +1880,7 @@ var ReplSetTest = function(opts) {
return false;
}
- }, "awaiting replication", timeout);
+ }, "awaitReplication timed out", timeout, retryIntervalMS);
}
};
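
Both helpers keep their existing call signatures working; retryIntervalMS falls back to the old 200ms when omitted. A hypothetical RollbackTest-style call (rst and the node list are assumed to exist):

    // Poll every 25ms instead of the default 200ms while waiting for the
    // secondaries' last-applied optimes to reach the primary's.
    rst.awaitReplication(ReplSetTest.kDefaultTimeoutMS,
                         ReplSetTest.OpTimeType.LAST_APPLIED,
                         [curSecondary, tiebreakerNode],
                         25 /* retryIntervalMS */);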