diff options
13 files changed, 93 insertions, 61 deletions
diff --git a/jstests/multiVersion/genericSetFCVUsage/downgrade_after_rollback_via_refetch.js b/jstests/multiVersion/genericSetFCVUsage/downgrade_after_rollback_via_refetch.js index d3ffe4a3b48..9f48b3a82a0 100644 --- a/jstests/multiVersion/genericSetFCVUsage/downgrade_after_rollback_via_refetch.js +++ b/jstests/multiVersion/genericSetFCVUsage/downgrade_after_rollback_via_refetch.js @@ -23,7 +23,7 @@ function testDowngrade(enableMajorityReadConcern) { let config = replTest.getReplSetConfig(); config.members[2].priority = 0; config.settings = {chainingAllowed: false}; - replTest.initiate(config); + replTest.initiateWithHighElectionTimeout(config); let rollbackTest = new RollbackTest(name, replTest); // Set the featureCompatibilityVersion to the last-stable version, so that we can downgrade diff --git a/jstests/multiVersion/libs/multiversion_rollback.js b/jstests/multiVersion/libs/multiversion_rollback.js index 66db2114ad9..9263052a956 100644 --- a/jstests/multiVersion/libs/multiversion_rollback.js +++ b/jstests/multiVersion/libs/multiversion_rollback.js @@ -114,7 +114,7 @@ function setupReplicaSet(testName, rollbackNodeVersion, syncSourceVersion) { var rst = new ReplSetTest( {name: testName, nodes: initialNodes, useBridge: true, settings: {chainingAllowed: false}}); rst.startSet(); - rst.initiate(); + rst.initiateWithHighElectionTimeout(); // Wait for both nodes to be up. waitForState(rst.nodes[0], ReplSetTest.State.PRIMARY); diff --git a/jstests/noPassthrough/rollback_wt_cache_full.js b/jstests/noPassthrough/rollback_wt_cache_full.js index f7733a0110b..c2c77bad07f 100644 --- a/jstests/noPassthrough/rollback_wt_cache_full.js +++ b/jstests/noPassthrough/rollback_wt_cache_full.js @@ -32,7 +32,7 @@ config.members[2].priority = 0; config.settings = { chainingAllowed: false }; -rst.initiate(config); +rst.initiateWithHighElectionTimeout(config); // Prior to 4.0, rollback imposed a 300 MB limit on the total size of documents to refetch from // the sync source. Therefore, we select values for numDocs and minDocSizeMB, while accounting diff --git a/jstests/replsets/avg_num_catchup_ops.js b/jstests/replsets/avg_num_catchup_ops.js index 3b194e700ab..5a3bd488722 100644 --- a/jstests/replsets/avg_num_catchup_ops.js +++ b/jstests/replsets/avg_num_catchup_ops.js @@ -10,14 +10,11 @@ load("jstests/replsets/libs/election_metrics.js"); load("jstests/replsets/rslib.js"); const name = jsTestName(); -const rst = new ReplSetTest({name: name, nodes: 3, useBridge: true}); +const rst = new ReplSetTest( + {name: name, nodes: 3, useBridge: true, settings: {catchUpTimeoutMillis: 4 * 60 * 1000}}); rst.startSet(); -const confSettings = { - catchUpTimeoutMillis: 4 * 60 * 1000, -}; - -rst.initiateWithHighElectionTimeout(confSettings); +rst.initiateWithHighElectionTimeout(); rst.awaitSecondaryNodes(); rst.awaitReplication(); diff --git a/jstests/replsets/change_stream_speculative_majority_rollback.js b/jstests/replsets/change_stream_speculative_majority_rollback.js index 3fad115cadd..e53b65ade88 100644 --- a/jstests/replsets/change_stream_speculative_majority_rollback.js +++ b/jstests/replsets/change_stream_speculative_majority_rollback.js @@ -27,7 +27,7 @@ const replTest = new ReplSetTest({ replTest.startSet(); let config = replTest.getReplSetConfig(); config.members[2].priority = 0; -replTest.initiate(config); +replTest.initiateWithHighElectionTimeout(config); const rollbackTest = new RollbackTest(name, replTest); const primary = rollbackTest.getPrimary(); diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js index ba76c4885f8..eb37aa4f449 100644 --- a/jstests/replsets/libs/rollback_test.js +++ b/jstests/replsets/libs/rollback_test.js @@ -12,6 +12,28 @@ * 4. kSyncSourceOpsDuringRollback: apply operations on the sync source after rollback has begun. * 5. kSteadyStateOps: (same as stage 1) with the option of waiting for the rollback to finish. * + * -------------------------------------------------- + * | STATE TRANSITION | NETWORK TOPOLOGY | + * |------------------------------------------------- + * | kSteadyStateOps | T | + * | | / \ | + * | | P1 - S | + * |-----------------------------|------------------| + * | kRollbackOps | T | + * | | / | + * | | P1 S | + * |-----------------------------|------------------| + * | kSyncSourceOpsBeforeRollback| T | + * | | \ | + * | | P1 P2 | + * |-----------------------------|------------------| + * | kSyncSourceOpsDuringRollback| T | + * | | \ | + * | | R - P2 | + * |------------------------------------------------- + * Note: 'T' refers to tiebreaker node, 'S' refers to secondary, 'P[n]' refers to primary in + * nth term and 'R' refers to rollback node. + * * Please refer to the various `transition*` functions for more information on the behavior * of each stage. */ @@ -31,6 +53,8 @@ load("jstests/hooks/validate_collections.js"); * must be configured with priority: 0 so that it won't be elected primary. Throughout * this file, this secondary will be referred to as the tiebreaker node. * 2. It must be running with mongobridge. + * 3. Must initiate the replset with high election timeout to avoid unplanned elections in the + * rollback test. * * If the caller does not provide their own replica set, a standard three-node * replset will be initialized instead, with all nodes running the latest version. @@ -112,6 +136,12 @@ function RollbackTest(name = "RollbackTest", replSet) { false, "Must set up ReplSetTest with chaining disabled."); + // Make sure electionTimeoutMillis is set to high value to avoid unplanned elections in + // the rollback test. + assert.gte(config.settings.electionTimeoutMillis, + ReplSetTest.kForeverMillis, + "Must initiate the replset with high election timeout"); + // Make sure the primary is not a priority: 0 node. assert.neq(0, config.members[0].priority); assert.eq(config.members[0].host, curPrimary.host); @@ -160,7 +190,7 @@ function RollbackTest(name = "RollbackTest", replSet) { let config = replSet.getReplSetConfig(); config.members[2].priority = 0; config.settings = {chainingAllowed: false}; - replSet.initiate(config); + replSet.initiateWithHighElectionTimeout(config); assert.eq(replSet.nodes.length, kNumDataBearingNodes, @@ -214,6 +244,17 @@ function RollbackTest(name = "RollbackTest", replSet) { } } + function stepUp(conn) { + log(`Waiting for the new primary ${conn.host} to be elected`); + assert.soonNoExcept(() => { + const res = conn.adminCommand({replSetStepUp: 1}); + return res.ok; + }); + + // Waits for the primary to accept new writes. + return rst.getPrimary(); + } + /** * Add a node to the ReplSetTest. It must be a non-voting node. If reInitiate is true, * also run ReplSetTest.reInitiate to configure the replset to include the new node. @@ -232,10 +273,10 @@ function RollbackTest(name = "RollbackTest", replSet) { * be replicated to all nodes and should not be rolled back. */ this.transitionToSteadyStateOperations = function({skipDataConsistencyChecks = false} = {}) { - // If we shut down the primary before the secondary begins rolling back against it, then - // the secondary may get elected and not actually roll back. In that case we do not check - // the RBID and just await replication. - if (!TestData.rollbackShutdowns) { + const isMajorityReadConcernEnabledOnRollbackNode = + assert.commandWorked(curSecondary.adminCommand({serverStatus: 1})) + .storageEngine.supportsCommittedReads; + if (isMajorityReadConcernEnabledOnRollbackNode) { log(`Waiting for rollback to complete on ${curSecondary.host}`, true); let rbid = -1; assert.soon(() => { @@ -252,6 +293,8 @@ function RollbackTest(name = "RollbackTest", replSet) { return rbid === lastRBID + 1; }, "Timed out waiting for RBID to increment on " + curSecondary.host); } else { + // TODO: After fixing SERVER-45178, we can remove the else block as we are guaranteed + // that the rollback id will get updated if the rollback has happened on that node. log(`Skipping RBID check on ${curSecondary.host} because shutdowns ` + `may prevent a rollback here.`); } @@ -287,9 +330,6 @@ function RollbackTest(name = "RollbackTest", replSet) { log(`Rollback on ${curSecondary.host} (if needed) and awaitReplication completed`, true); - // Unfreeze the node if it was previously frozen, so that it can run for the election. - assert.commandWorked(curSecondary.adminCommand({replSetFreeze: 0})); - // We call transition to steady state ops after awaiting replication has finished, // otherwise it could be confusing to see operations being replicated when we're already // in rollback complete state. @@ -375,13 +415,7 @@ function RollbackTest(name = "RollbackTest", replSet) { elected`); curSecondary.reconnect([tiebreakerNode]); - log(`Waiting for the new primary ${curSecondary.host} to be elected`); - assert.soonNoExcept(() => { - const res = curSecondary.adminCommand({replSetStepUp: 1}); - return res.ok; - }); - - const newPrimary = rst.getPrimary(); + const newPrimary = stepUp(curSecondary); // As a sanity check, ensure the new primary is the old secondary. The opposite scenario // should never be possible with 2 electable nodes and the sequence of operations thus far. @@ -393,6 +427,16 @@ function RollbackTest(name = "RollbackTest", replSet) { curSecondary = curPrimary; curPrimary = newPrimary; + // To ensure rollback won't be skipped for shutdowns, wait till the no-op oplog + // entry ("new primary") written in the new term gets persisted in the disk. + // Note: rollbackShutdowns are not allowed for in-memory/ephemeral storage engines. + if (TestData.rollbackShutdowns) { + const dbName = "TermGetsPersisted"; + assert.commandWorked(curPrimary.getDB(dbName).ensureRollback.insert( + {thisDocument: 'is inserted to ensure rollback is not skipped'}, + {writeConcern: {w: 1, j: true}})); + } + lastRBID = assert.commandWorked(curSecondary.adminCommand("replSetGetRBID")).rbid; // The current primary, which is the old secondary, will later become the sync source. @@ -411,18 +455,6 @@ function RollbackTest(name = "RollbackTest", replSet) { this.transitionToSyncSourceOperationsDuringRollback = function() { transitionIfAllowed(State.kSyncSourceOpsDuringRollback); - // If the rollback node was restarted, make sure it has finished restarting and become a - // secondary again. Otherwise, the subsequent 'replSetFreeze' command could fail with - // NotYetInitialized if the node is still in the process of restarting (e.g. not yet loaded - // the local config or reached the STARTUP2 state). - waitForState(curSecondary, ReplSetTest.State.SECONDARY); - - // If the nodes are restarted after the rollback node is able to rollback successfully and - // catch up to curPrimary's oplog, then the rollback node can become the new primary. - // If so, it can lead to unplanned state transitions, like unconditional step down, during - // the test. To avoid those problems, prevent rollback node from starting an election. - assert.commandWorked(curSecondary.adminCommand({replSetFreeze: ReplSetTest.kForeverSecs})); - log(`Reconnecting the secondary ${curSecondary.host} so it'll go into rollback`); // Reconnect the rollback node to the current primary, which is the node we want to sync // from. If we reconnect to both the current primary and the tiebreaker node, the rollback @@ -487,20 +519,21 @@ function RollbackTest(name = "RollbackTest", replSet) { log(`Restarting node ${hostName}`); rst.start(nodeId, startOptions, true /* restart */); - // Freeze the node if the restarted node is the rollback node. - if (curState === State.kSyncSourceOpsDuringRollback && - rst.getNodeId(curSecondary) === nodeId) { - rst.freeze(nodeId); - } - - const oldPrimary = curPrimary; - // Wait for the new primary to be elected and ready to take operations before continuing. - curPrimary = rst.getPrimary(); - - // The primary can change after node restarts only if all the 3 nodes are connected to each - // other. - if (curState !== State.kSteadyStateOps) { - assert.eq(curPrimary, oldPrimary); + // Step up if the restarted node is the current primary. + if (rst.getNodeId(curPrimary) === nodeId) { + // To prevent below step up from being flaky, we step down and freeze the + // current secondary to prevent starting a new election. The current secondary + // can start running election due to explicit step up by the shutting down of current + // primary if the server parameter "enableElectionHandoff" is set to true. + rst.freeze(curSecondary); + + const newPrimary = stepUp(curPrimary); + // As a sanity check, ensure the new primary is the current primary. This is true, + // because we have configured the replica set with high electionTimeoutMillis. + assert.eq(newPrimary, curPrimary, "Did not elect the same node as primary"); + + // Unfreeze the current secondary so that it can step up again. + assert.commandWorked(curSecondary.adminCommand({replSetFreeze: 0})); } curSecondary = rst.getSecondary(); diff --git a/jstests/replsets/rollback_after_disabling_majority_reads.js b/jstests/replsets/rollback_after_disabling_majority_reads.js index e8b2eeeebba..f1154068ee5 100644 --- a/jstests/replsets/rollback_after_disabling_majority_reads.js +++ b/jstests/replsets/rollback_after_disabling_majority_reads.js @@ -21,7 +21,7 @@ config.members[2].priority = 0; config.settings = { chainingAllowed: false }; -replTest.initiate(config); +replTest.initiateWithHighElectionTimeout(config); const rollbackTest = new RollbackTest(name, replTest); const rollbackNode = rollbackTest.transitionToRollbackOperations(); diff --git a/jstests/replsets/rollback_after_enabling_majority_reads.js b/jstests/replsets/rollback_after_enabling_majority_reads.js index 85093baa51f..f7c346aca9e 100644 --- a/jstests/replsets/rollback_after_enabling_majority_reads.js +++ b/jstests/replsets/rollback_after_enabling_majority_reads.js @@ -27,7 +27,7 @@ config.members[2].priority = 0; config.settings = { chainingAllowed: false }; -replTest.initiate(config); +replTest.initiateWithHighElectionTimeout(config); let rollbackTest = new RollbackTest(name, replTest); jsTest.log("Ensure the stable timestamp is ahead of the common point on the rollback node."); diff --git a/jstests/replsets/rollback_via_refetch_commit_transaction.js b/jstests/replsets/rollback_via_refetch_commit_transaction.js index 317fc7b97f8..35e21b25bd1 100644 --- a/jstests/replsets/rollback_via_refetch_commit_transaction.js +++ b/jstests/replsets/rollback_via_refetch_commit_transaction.js @@ -31,7 +31,7 @@ config.members[2].priority = 0; config.settings = { chainingAllowed: false }; -rst.initiate(config); +rst.initiateWithHighElectionTimeout(config); const primaryNode = rst.getPrimary(); diff --git a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js index acb8bea802b..bd229838a62 100644 --- a/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js +++ b/jstests/replsets/rollback_via_refetch_survives_nonexistent_collection_drop.js @@ -26,7 +26,7 @@ config.members[2].priority = 0; config.settings = { chainingAllowed: false }; -rst.initiate(config); +rst.initiateWithHighElectionTimeout(config); const rollbackTest = new RollbackTest(collName, rst); diff --git a/jstests/replsets/transactions_after_rollback_via_refetch.js b/jstests/replsets/transactions_after_rollback_via_refetch.js index 463d70f4489..80ef4a8ded9 100644 --- a/jstests/replsets/transactions_after_rollback_via_refetch.js +++ b/jstests/replsets/transactions_after_rollback_via_refetch.js @@ -61,7 +61,7 @@ let replTest = new ReplSetTest({ replTest.startSet(); let config = replTest.getReplSetConfig(); config.members[2].priority = 0; -replTest.initiate(config); +replTest.initiateWithHighElectionTimeout(config); let rollbackTest = new RollbackTest(name, replTest); diff --git a/jstests/replsets/unrecoverable_rollback_early_exit.js b/jstests/replsets/unrecoverable_rollback_early_exit.js index 96428a719a7..c709820eaa6 100644 --- a/jstests/replsets/unrecoverable_rollback_early_exit.js +++ b/jstests/replsets/unrecoverable_rollback_early_exit.js @@ -30,7 +30,7 @@ const rst = new ReplSetTest({ nodeOptions: {enableMajorityReadConcern: "false"} }); rst.startSet(); -rst.initiate(); +rst.initiateWithHighElectionTimeout(); const rollbackTest = new RollbackTest(testName, rst); const rollbackNode = rollbackTest.transitionToRollbackOperations(); diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index 1b3b9df19eb..ff2e713a695 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -1386,10 +1386,11 @@ var ReplSetTest = function(opts) { * Modifies the election timeout to be 24 hours so that no unplanned elections happen. Then * runs replSetInitiate on the replica set with the new config. */ - this.initiateWithHighElectionTimeout = function(opts = {}) { - let cfg = this.getReplSetConfig(); - cfg.settings = Object.assign(opts, {"electionTimeoutMillis": 24 * 60 * 60 * 1000}); - this.initiate(cfg); + this.initiateWithHighElectionTimeout = function(config) { + config = config || this.getReplSetConfig(); + config.settings = config.settings || {}; + config.settings["electionTimeoutMillis"] = ReplSetTest.kForeverMillis; + this.initiate(config); }; /** @@ -3197,6 +3198,7 @@ ReplSetTest.kDefaultTimeoutMS = 10 * 60 * 1000; * Global default number that's effectively infinite. */ ReplSetTest.kForeverSecs = 24 * 60 * 60; +ReplSetTest.kForeverMillis = ReplSetTest.kForeverSecs * 1000; /** * Set of states that the replica set can be in. Used for the wait functions. |