From a30cef00b9e3ccfe60a4dce7f9f2c7cbaf1e68b7 Mon Sep 17 00:00:00 2001
From: William Schultz
Date: Mon, 27 Apr 2020 12:17:43 -0400
Subject: SERVER-44068 Speed up transition to 'kSteadyStateOps' in RollbackTest

---
 jstests/libs/write_concern_util.js     |  8 +++-
 jstests/replsets/libs/rollback_test.js | 74 +++++++++++++++++++++-------------
 src/mongo/db/repl/bgsync.cpp           | 18 ++++-----
 src/mongo/db/repl/bgsync.h             |  2 +
 src/mongo/shell/check_log.js           | 15 ++++---
 src/mongo/shell/replsettest.js         | 10 +++--
 6 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js
index 90832a858d8..dfbae445d2e 100644
--- a/jstests/libs/write_concern_util.js
+++ b/jstests/libs/write_concern_util.js
@@ -18,7 +18,8 @@ function shardCollectionWithChunks(st, coll, numDocs) {
 }
 
 // Stops replication on the given server(s).
-function stopServerReplication(conn) {
+function stopServerReplication(conn, retryIntervalMS) {
+    retryIntervalMS = retryIntervalMS || 300;
     if (conn.length) {
         conn.forEach(function(n) {
-            stopServerReplication(n);
+            stopServerReplication(n, retryIntervalMS);
         });
@@ -37,7 +38,10 @@ function stopServerReplication(conn) {
     // the fail point won't be hit until the node transitions from being the primary.
     if (assert.commandWorked(conn.adminCommand('replSetGetStatus')).myState !=
         ReplSetTest.State.PRIMARY) {
-        checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled');
+        checkLog.contains(conn,
+                          'bgsync - stopReplProducer fail point enabled',
+                          ReplSetTest.kDefaultTimeoutMS,
+                          retryIntervalMS);
     }
 }
 
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index 75285ee1714..18f51de84e9 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -43,6 +43,7 @@
 load("jstests/replsets/rslib.js");
 load("jstests/replsets/libs/two_phase_drops.js");
 load("jstests/hooks/validate_collections.js");
+load('jstests/libs/fail_point_util.js');
 
 /**
  *
@@ -90,6 +91,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
     const SIGTERM = 15;
     const kNumDataBearingNodes = 3;
     const kElectableNodes = 2;
+    const kRetryIntervalMS = 25;
 
     let awaitSecondaryNodesForRollbackTimeout;
 
@@ -155,6 +157,11 @@ function RollbackTest(name = "RollbackTest", replSet) {
         waitForState(curSecondary, ReplSetTest.State.SECONDARY);
         waitForState(tiebreakerNode, ReplSetTest.State.SECONDARY);
 
+        // Make sync source selection faster.
+        replSet.nodes.forEach((node) => {
+            setFailPoint(node, "forceBgSyncSyncSourceRetryWaitMS", {sleepMS: kRetryIntervalMS});
+        });
+
         rst = replSet;
         lastRBID = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
 
@@ -271,10 +278,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
         assert.soonNoExcept(() => {
             const res = conn.adminCommand({replSetStepUp: 1});
             return res.ok;
-        }, `failed to step up node ${conn.host}`, ReplSetTest.kDefaultTimeoutMS, 25);
+        }, `failed to step up node ${conn.host}`, ReplSetTest.kDefaultTimeoutMS, kRetryIntervalMS);
 
         // Waits for the primary to accept new writes.
-        return rst.getPrimary(ReplSetTest.kDefaultTimeoutMS, 25);
+        return rst.getPrimary(ReplSetTest.kDefaultTimeoutMS, kRetryIntervalMS);
     }
 
     function oplogTop(conn) {
@@ -320,41 +327,45 @@ function RollbackTest(name = "RollbackTest", replSet) {
         // suites, as it is possible in rare cases for the sync source to lose the entry
         // corresponding to the optime the rollback node chose as its minValid.
         log(`Wait for ${curSecondary.host} to finish rollback`);
-        assert.soonNoExcept(() => {
-            try {
-                log(`Wait for secondary ${curSecondary} and tiebreaker ${tiebreakerNode}`);
-                rst.awaitSecondaryNodesForRollbackTest(
-                    awaitSecondaryNodesForRollbackTimeout,
-                    [curSecondary, tiebreakerNode],
-                    curSecondary /* connToCheckForUnrecoverableRollback */);
-            } catch (e) {
-                if (e.unrecoverableRollbackDetected) {
-                    log(`Detected unrecoverable rollback on ${curSecondary.host}. Ending test.`,
-                        true /* important */);
-                    TestData.skipCheckDBHashes = true;
-                    rst.stopSet();
-                    quit();
+        assert.soonNoExcept(
+            () => {
+                try {
+                    log(`Wait for secondary ${curSecondary} and tiebreaker ${tiebreakerNode}`);
+                    rst.awaitSecondaryNodesForRollbackTest(
+                        awaitSecondaryNodesForRollbackTimeout,
+                        [curSecondary, tiebreakerNode],
+                        curSecondary /* connToCheckForUnrecoverableRollback */,
+                        kRetryIntervalMS);
+                } catch (e) {
+                    if (e.unrecoverableRollbackDetected) {
+                        log(`Detected unrecoverable rollback on ${curSecondary.host}. Ending test.`,
+                            true /* important */);
+                        TestData.skipCheckDBHashes = true;
+                        rst.stopSet();
+                        quit();
+                    }
+                    // Re-throw the original exception in all other cases.
+                    throw e;
                 }
-                // Re-throw the original exception in all other cases.
-                throw e;
-            }
 
-            let rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
-            assert(rbid > lastRBID,
-                   `Expected RBID to increment past ${lastRBID} on ${curSecondary.host}`);
+                let rbid = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid;
+                assert(rbid > lastRBID,
+                       `Expected RBID to increment past ${lastRBID} on ${curSecondary.host}`);
 
-            assert.eq(oplogTop(curPrimary), oplogTop(curSecondary));
-
-            return true;
-        });
+                assert.eq(oplogTop(curPrimary), oplogTop(curSecondary));
+                return true;
+            },
+            `Waiting for rollback to complete on ${curSecondary.host} failed`,
+            ReplSetTest.kDefaultTimeoutMS,
+            kRetryIntervalMS);
 
         log(`Rollback on ${curSecondary.host} completed, reconnecting tiebreaker`, true);
         tiebreakerNode.reconnect([curPrimary, curSecondary]);
         // Allow replication temporarily so the following checks succeed.
         restartServerReplication(tiebreakerNode);
 
-        rst.awaitReplication(null, null, [curSecondary, tiebreakerNode]);
+        rst.awaitReplication(null, null, [curSecondary, tiebreakerNode], kRetryIntervalMS);
         log(`awaitReplication completed`, true);
 
         // We call transition to steady state ops after awaiting replication has finished,
@@ -373,7 +384,7 @@ function RollbackTest(name = "RollbackTest", replSet) {
 
         // Now that awaitReplication and checkDataConsistency are done, stop replication again so
        // tiebreakerNode is never part of w: majority writes, see comment at top.
-        stopServerReplication(tiebreakerNode);
+        stopServerReplication(tiebreakerNode, kRetryIntervalMS);
         log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`);
         return curPrimary;
     };
@@ -496,8 +507,10 @@ function RollbackTest(name = "RollbackTest", replSet) {
         log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`);
         // Reconnect the rollback node to the current primary, which is the node we want to sync
         // from. If we reconnect to both the current primary and the tiebreaker node, the rollback
-        // node may choose the tiebreaker.
+        // node may choose the tiebreaker. Send out a new round of heartbeats immediately so that
+        // the rollback node can find a sync source quickly.
         curSecondary.reconnect([curPrimary]);
+        assert.commandWorked(curSecondary.adminCommand({replSetTest: 1, restartHeartbeats: 1}));
 
         log(`RollbackTest transition to ${curState} took ${(new Date() - start)} ms`);
         return curPrimary;
@@ -566,6 +579,9 @@ function RollbackTest(name = "RollbackTest", replSet) {
 
         // Fail-point will clear on restart so do post-start.
         setFastGetMoreEnabled(rst.nodes[nodeId]);
+        // Make sync source selection faster.
+        setFailPoint(
+            rst.nodes[nodeId], "forceBgSyncSyncSourceRetryWaitMS", {sleepMS: kRetryIntervalMS});
 
         // Step up if the restarted node is the current primary.
         if (rst.getNodeId(curPrimary) === nodeId) {
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 02cb1daff36..a81b0ebbb11 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -265,7 +265,7 @@ void BackgroundSync::_produce() {
             LOGV2(21079,
                   "bgsync - stopReplProducer fail point "
                   "enabled. Blocking until fail point is disabled.");
-            mongo::sleepsecs(1);
+            mongo::sleepmillis(_getRetrySleepMS());
             return;
         }
 
@@ -407,10 +407,7 @@ void BackgroundSync::_produce() {
-    // out of date. In that case we sleep for 1 second to reduce the amount we spin waiting
-    // for our map to update.
+    // out of date. In that case we sleep for the retry interval (1 second by default) to
+    // reduce the amount of time we spin waiting for our map to update.
     if (oldSource == source) {
-        long long sleepMS = 1000;
-        forceBgSyncSyncSourceRetryWaitMS.execute(
-            [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
-
+        long long sleepMS = _getRetrySleepMS();
         LOGV2(21087,
               "Chose same sync source candidate as last time, {syncSource}. Sleeping for "
               "{sleepDurationMillis}ms to avoid immediately choosing a new sync source for the "
@@ -439,10 +436,7 @@ void BackgroundSync::_produce() {
                       "error"_attr = syncSourceResp.syncSourceStatus.getStatus());
         }
 
-        long long sleepMS = 1000;
-        forceBgSyncSyncSourceRetryWaitMS.execute(
-            [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
-
+        long long sleepMS = _getRetrySleepMS();
         // No sync source found.
         LOGV2_DEBUG(21090,
                     1,
@@ -962,6 +956,12 @@ void BackgroundSync::startProducerIfStopped() {
     }
 }
 
+long long BackgroundSync::_getRetrySleepMS() {
+    long long sleepMS = 1000;
+    forceBgSyncSyncSourceRetryWaitMS.execute(
+        [&](const BSONObj& data) { sleepMS = data["sleepMS"].numberInt(); });
+    return sleepMS;
+}
 }  // namespace repl
 }  // namespace mongo
 
diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h
index 5e7558fc7bd..3502597a179 100644
--- a/src/mongo/db/repl/bgsync.h
+++ b/src/mongo/db/repl/bgsync.h
@@ -211,6 +211,8 @@ private:
 
     OpTime _readLastAppliedOpTime(OperationContext* opCtx);
 
+    long long _getRetrySleepMS();
+
     // This OplogApplier applies oplog entries fetched from the sync source.
     OplogApplier* const _oplogApplier;
 
diff --git a/src/mongo/shell/check_log.js b/src/mongo/shell/check_log.js
index 82f0792a2b8..fcd70af061c 100644
--- a/src/mongo/shell/check_log.js
+++ b/src/mongo/shell/check_log.js
@@ -118,13 +118,16 @@ checkLog = (function() {
      * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
      * the provided 'msg' is found in the logs, or it times out. Throws an exception on timeout.
      */
-    let contains = function(conn, msg, timeout = 5 * 60 * 1000) {
+    let contains = function(conn, msg, timeout = 5 * 60 * 1000, retryIntervalMS = 300) {
         // Don't run the hang analyzer because we don't expect contains() to always succeed.
-        assert.soon(function() {
-            return checkContainsOnce(conn, msg);
-        }, 'Could not find log entries containing the following message: ' + msg, timeout, 300, {
-            runHangAnalyzer: false
-        });
+        assert.soon(
+            function() {
+                return checkContainsOnce(conn, msg);
+            },
+            'Could not find log entries containing the following message: ' + msg,
+            timeout,
+            retryIntervalMS,
+            {runHangAnalyzer: false});
     };
 
     let containsJson = function(conn, id, attrsDict, timeout = 5 * 60 * 1000) {
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index a2930b60d27..f08a51e239a 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -682,9 +682,10 @@ var ReplSetTest = function(opts) {
      * if it throws.
      */
     this.awaitSecondaryNodesForRollbackTest = function(
-        timeout, slaves, connToCheckForUnrecoverableRollback) {
+        timeout, slaves, connToCheckForUnrecoverableRollback, retryIntervalMS) {
+        retryIntervalMS = retryIntervalMS || 200;
         try {
-            this.awaitSecondaryNodes(timeout, slaves);
+            this.awaitSecondaryNodes(timeout, slaves, retryIntervalMS);
         } catch (originalEx) {
             // There is a special case where we expect the (rare) possibility of unrecoverable
             // rollbacks with EMRC:false in rollback suites with unclean shutdowns.
@@ -1679,12 +1680,13 @@ var ReplSetTest = function(opts) {
     // Wait until the optime of the specified type reaches the primary's last applied optime. Blocks
     // on all secondary nodes or just 'slaves', if specified. The timeout will reset if any of the
     // secondaries makes progress.
-    this.awaitReplication = function(timeout, secondaryOpTimeType, slaves) {
+    this.awaitReplication = function(timeout, secondaryOpTimeType, slaves, retryIntervalMS) {
         if (slaves !== undefined && slaves !== self._slaves) {
             print("ReplSetTest awaitReplication: going to check only " + slaves.map(s => s.host));
         }
 
         timeout = timeout || self.kDefaultTimeoutMS;
+        retryIntervalMS = retryIntervalMS || 200;
 
         secondaryOpTimeType = secondaryOpTimeType || ReplSetTest.OpTimeType.LAST_APPLIED;
 
@@ -1878,7 +1880,7 @@ var ReplSetTest = function(opts) {
                     return false;
                 }
 
-            }, "awaiting replication", timeout);
+            }, "awaitReplication timed out", timeout, retryIntervalMS);
         }
     };
 
-- 
cgit v1.2.1
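
Usage sketch (illustrative, not part of the patch): the snippet below shows how the 'forceBgSyncSyncSourceRetryWaitMS' fail point added in bgsync.cpp and the new retryIntervalMS parameters on stopServerReplication() and awaitReplication() fit together in a test. It assumes an already-initiated ReplSetTest named 'rst'; the 25ms and 50ms values are arbitrary choices for this example.

    // Minimal sketch; assumes an initiated ReplSetTest 'rst' with a stable primary.
    load("jstests/libs/write_concern_util.js");

    const secondary = rst.getSecondary();

    // Shorten bgsync's sync source retry sleep from its 1000ms default; the fail
    // point reads 'sleepMS' out of its data document (see _getRetrySleepMS()).
    assert.commandWorked(secondary.adminCommand({
        configureFailPoint: "forceBgSyncSyncSourceRetryWaitMS",
        mode: "alwaysOn",
        data: {sleepMS: 25}
    }));

    // Pause replication, polling the log for the stopReplProducer message every
    // 50ms instead of the 300ms default.
    stopServerReplication(secondary, 50 /* retryIntervalMS */);

    // ... writes against the primary that the secondary must not yet apply ...

    // Resume replication and wait for the secondary to catch up, re-checking
    // optimes every 50ms via the new awaitReplication() parameter.
    restartServerReplication(secondary);
    rst.awaitReplication(undefined, undefined, [secondary], 50 /* retryIntervalMS */);

    assert.commandWorked(secondary.adminCommand(
        {configureFailPoint: "forceBgSyncSyncSourceRetryWaitMS", mode: "off"}));

The tighter intervals trade a little extra polling for much faster state transitions, which is the same tradeoff RollbackTest makes with kRetryIntervalMS = 25 above.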