diff options
author | Spencer T Brody <spencer@mongodb.com> | 2017-01-17 17:06:52 -0500 |
---|---|---|
committer | Spencer T Brody <spencer@mongodb.com> | 2017-01-23 16:18:50 -0500 |
commit | aca85db93035c35887f7ecf5a64f972bb0a2c82a (patch) | |
tree | 2deb4fe13f348a66e8905234482ffa4973f763e5 | |
parent | f0cf61dbcdc569e1b3642097787068c7b1139273 (diff) | |
download | mongo-aca85db93035c35887f7ecf5a64f972bb0a2c82a.tar.gz |
SERVER-27680 Merge stopOplogFetcher and pauseRsBgSyncProducer failpoint into single stopReplProducer failpoint
(cherry picked from commit 21948042b6da5fb5bf15897f9808a70551f5af09)
-rw-r--r-- | jstests/libs/check_log.js | 99 | ||||
-rw-r--r-- | jstests/libs/write_concern_util.js | 13 | ||||
-rw-r--r-- | jstests/replsets/catchup.js | 42 | ||||
-rw-r--r-- | jstests/replsets/read_committed_stale_history.js | 6 | ||||
-rw-r--r-- | jstests/replsets/server8070.js | 272 | ||||
-rw-r--r-- | jstests/sharding/config_version_rollback.js | 14 | ||||
-rw-r--r-- | jstests/sharding/shard_identity_rollback.js | 24 | ||||
-rw-r--r-- | src/mongo/db/repl/bgsync.cpp | 23 | ||||
-rw-r--r-- | src/mongo/db/repl/data_replicator.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_fetcher.cpp | 10 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_fetcher.h | 2 |
11 files changed, 247 insertions, 260 deletions
diff --git a/jstests/libs/check_log.js b/jstests/libs/check_log.js index 8cb064f9558..5a9ba10435a 100644 --- a/jstests/libs/check_log.js +++ b/jstests/libs/check_log.js @@ -1,53 +1,64 @@ -"use strict"; - /* * Helper functions which connect to a server, and check its logs for particular strings. */ -var checkLog = (function() { - /* - * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until the - * provided 'msg' is found in the logs, or 5 minutes have elapsed. Throws an exception on - * timeout. - */ - var contains = function(conn, msg) { - assert.soon( - function() { - var logMessages = assert.commandWorked(conn.adminCommand({getLog: 'global'})).log; - for (var i = 0; i < logMessages.length; i++) { - if (logMessages[i].indexOf(msg) != -1) { - return true; +var checkLog; + +(function() { + "use strict"; + + if (checkLog) { + return; // Protect against this file being double-loaded. + } + + checkLog = (function() { + /* + * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until + * the provided 'msg' is found in the logs, or 5 minutes have elapsed. Throws an exception + * on timeout. + */ + var contains = function(conn, msg) { + assert.soon( + function() { + var logMessages = + assert.commandWorked(conn.adminCommand({getLog: 'global'})).log; + for (var i = 0; i < logMessages.length; i++) { + if (logMessages[i].indexOf(msg) != -1) { + return true; + } } - } - return false; - }, - 'Could not find log entries containing the following message: ' + msg, - 5 * 60 * 1000, - 300); - }; + return false; + }, + 'Could not find log entries containing the following message: ' + msg, + 5 * 60 * 1000, + 300); + }; - /* - * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until the - * provided 'msg' is found in the logs exactly 'expectedCount' times, or 5 minutes have elapsed. - * Throws an exception on timeout. - */ - var containsWithCount = function(conn, msg, expectedCount) { - var count = 0; - assert.soon( - function() { - var logMessages = assert.commandWorked(conn.adminCommand({getLog: 'global'})).log; - for (var i = 0; i < logMessages.length; i++) { - if (logMessages[i].indexOf(msg) != -1) { - count++; + /* + * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until + * the provided 'msg' is found in the logs exactly 'expectedCount' times, or 5 minutes have + * elapsed. + * Throws an exception on timeout. + */ + var containsWithCount = function(conn, msg, expectedCount) { + var count = 0; + assert.soon( + function() { + var logMessages = + assert.commandWorked(conn.adminCommand({getLog: 'global'})).log; + for (var i = 0; i < logMessages.length; i++) { + if (logMessages[i].indexOf(msg) != -1) { + count++; + } } - } - return expectedCount === count; - }, - 'Expected ' + expectedCount + ', but instead saw ' + count + - ' log entries containing the following message: ' + msg, - 5 * 60 * 1000, - 300); - }; + return expectedCount === count; + }, + 'Expected ' + expectedCount + ', but instead saw ' + count + + ' log entries containing the following message: ' + msg, + 5 * 60 * 1000, + 300); + }; - return {contains: contains, containsWithCount: containsWithCount}; + return {contains: contains, containsWithCount: containsWithCount}; + })(); })(); diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js index d7541b3b9a6..0115fda8c4c 100644 --- a/jstests/libs/write_concern_util.js +++ b/jstests/libs/write_concern_util.js @@ -2,6 +2,8 @@ * Utilities for testing writeConcern. */ +load("jstests/libs/check_log.js"); + // Shards a collection and creates 2 chunks, one on each s of two shards. function shardCollectionWithChunks(st, coll) { var _db = coll.getDB(); @@ -25,10 +27,13 @@ function stopServerReplication(conn) { }); return; } - var errMsg = 'Failed to enable stopOplogFetcher failpoint.'; + var errMsg = 'Failed to enable stopReplProducer failpoint.'; assert.commandWorked( - conn.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'}), + conn.getDB('admin').runCommand({configureFailPoint: 'stopReplProducer', mode: 'alwaysOn'}), errMsg); + + // Wait until the fail point is actually hit. + checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled'); } // Stops replication at all replicaset secondaries. @@ -50,9 +55,9 @@ function restartServerReplication(conn) { return; } - var errMsg = 'Failed to disable stopOplogFetcher failpoint.'; + var errMsg = 'Failed to disable stopReplProducer failpoint.'; assert.commandWorked( - conn.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}), + conn.getDB('admin').runCommand({configureFailPoint: 'stopReplProducer', mode: 'off'}), errMsg); } diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js index 8be2d20da93..8cb3ec68ae9 100644 --- a/jstests/replsets/catchup.js +++ b/jstests/replsets/catchup.js @@ -4,6 +4,7 @@ "use strict"; load("jstests/libs/check_log.js"); + load("jstests/libs/write_concern_util.js"); load("jstests/replsets/rslib.js"); var name = "catch_up"; @@ -33,31 +34,6 @@ node.adminCommand(verbosity); }); - function enableFailPoint(node) { - jsTest.log("enable failpoint " + node.host); - // Disable syncing on both secondaries. - assert.commandWorked( - node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'}), - 'Failed to configure pauseRsBgSyncProducer failpoint.'); - assert.commandWorked( - node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'})); - } - - function disableFailPoint(node) { - jsTest.log("disable failpoint " + node.host); - assert.commandWorked( - node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'})); - try { - assert.commandWorked( - node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'}), - 'Failed to disable pauseRsBgSyncProducer failpoint.'); - } catch (e) { - // Enable bgsync producer may cause rollback, which will close all connections - // including the one sending "configureFailPoint". - print("got exception when disabling fail point 'pauseRsBgSyncProducer': " + e); - } - } - function stepUp(node) { assert.commandWorked(node.adminCommand({replSetStepUp: 1})); return node; @@ -96,7 +72,7 @@ jsTest.log("Case 2: The primary needs to catch up, succeeds in time."); // Write documents that cannot be replicated to secondaries in time. var originalSecondaries = rst.getSecondaries(); - originalSecondaries.forEach(enableFailPoint); + stopServerReplication(originalSecondaries); doWrites(rst.getPrimary()); var latestOp = getLatestOp(rst.getPrimary()); // New primary wins immediately, but needs to catch up. @@ -105,7 +81,7 @@ // Check this node is not writable. assert.eq(newPrimary.getDB("test").isMaster().ismaster, false); // Disable fail point to allow replication. - originalSecondaries.forEach(disableFailPoint); + restartServerReplication(originalSecondaries); // getPrimary() blocks until the primary finishes drain mode. assert.eq(newPrimary, rst.getPrimary()); // Wait for all secondaries to catch up @@ -116,7 +92,7 @@ jsTest.log("Case 3: The primary needs to catch up, but has to change sync source to catch up."); // Write documents that cannot be replicated to secondaries in time. - rst.getSecondaries().forEach(enableFailPoint); + stopServerReplication(rst.getSecondaries()); doWrites(rst.getPrimary()); var oldPrimary = rst.getPrimary(); originalSecondaries = rst.getSecondaries(); @@ -125,13 +101,13 @@ rst.awaitNodesAgreeOnPrimary(); // Disable fail point on one of the other secondaries. // Wait until it catches up with the old primary. - disableFailPoint(originalSecondaries[1]); + restartServerReplication(originalSecondaries[1]); assert.commandWorked(originalSecondaries[1].adminCommand({replSetSyncFrom: oldPrimary.host})); awaitOpTime(originalSecondaries[1], latestOp.ts); // Disconnect the new primary and the old one. oldPrimary.disconnect(newPrimary); // Disable the failpoint, the new primary should sync from the other secondary. - disableFailPoint(newPrimary); + restartServerReplication(newPrimary); assert.eq(newPrimary, rst.getPrimary()); checkOpInOplog(newPrimary, latestOp, 1); // Restore the broken connection @@ -148,7 +124,7 @@ // Write documents that cannot be replicated to secondaries in time. originalSecondaries = rst.getSecondaries(); - originalSecondaries.forEach(enableFailPoint); + stopServerReplication(originalSecondaries); doWrites(rst.getPrimary()); latestOp = getLatestOp(rst.getPrimary()); @@ -158,7 +134,7 @@ var latestOpOnNewPrimary = getLatestOp(newPrimary); // Wait until the new primary completes the transition to primary and writes a no-op. checkLog.contains(newPrimary, "Cannot catch up oplog after becoming primary"); - disableFailPoint(newPrimary); + restartServerReplication(newPrimary); assert.eq(newPrimary, rst.getPrimary()); // Wait for the no-op "new primary" after winning an election, so that we know it has @@ -168,5 +144,5 @@ }); // The extra oplog entries on the old primary are not replicated to the new one. checkOpInOplog(newPrimary, latestOp, 0); - disableFailPoint(originalSecondaries[1]); + restartServerReplication(originalSecondaries[1]); })(); diff --git a/jstests/replsets/read_committed_stale_history.js b/jstests/replsets/read_committed_stale_history.js index 9793a47576e..5751ba9cf28 100644 --- a/jstests/replsets/read_committed_stale_history.js +++ b/jstests/replsets/read_committed_stale_history.js @@ -89,6 +89,9 @@ nodes[0].disconnect(nodes[1]); nodes[0].disconnect(nodes[2]); + // Ensure the soon-to-be primary cannot see the write from the old primary. + assert.eq(null, nodes[1].getDB(dbName).getCollection(collName).findOne({a: 2})); + jsTest.log("Wait for a new primary to be elected"); // Allow the secondaries to replicate again. restartServerReplication(secondaries); @@ -99,6 +102,9 @@ assert.writeOK(nodes[1].getDB(dbName).getCollection(collName).insert( {a: 3}, {writeConcern: {w: 2, wtimeout: rst.kDefaultTimeoutMS}})); + // Ensure the new primary still cannot see the write from the old primary. + assert.eq(null, nodes[1].getDB(dbName).getCollection(collName).findOne({a: 2})); + jsTest.log("Reconnect the old primary to the rest of the nodes"); nodes[1].reconnect(nodes[0]); nodes[2].reconnect(nodes[0]); diff --git a/jstests/replsets/server8070.js b/jstests/replsets/server8070.js index 5c3a4e4a70f..6be51507d4d 100644 --- a/jstests/replsets/server8070.js +++ b/jstests/replsets/server8070.js @@ -3,135 +3,145 @@ // data (member3), then puts 50 more ops in member3's buffer and makes sure that member3 doesn't try // to sync from member2. -// helper to ensure two nodes are at the same place in the oplog -var waitForSameOplogPosition = function(db1, db2, errmsg) { +(function() { + "use strict"; + + load('jstests/libs/write_concern_util.js'); + + // helper to ensure two nodes are at the same place in the oplog + var waitForSameOplogPosition = function(db1, db2, errmsg) { + assert.soon(function() { + var last1 = + db1.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); + var last2 = + db2.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); + jsTest.log("primary: " + tojson(last1) + " secondary: " + tojson(last2)); + + return ((last1.ts.t === last2.ts.t) && (last1.ts.i === last2.ts.i)); + }, errmsg); + }; + + // start set + var replSet = new ReplSetTest({name: 'testSet', nodes: 3}); + replSet.startSet(); + replSet.initiate({ + _id: 'testSet', + members: [ + {_id: 0, host: getHostName() + ":" + replSet.ports[0]}, + {_id: 1, host: getHostName() + ":" + replSet.ports[1], priority: 0}, + {_id: 2, host: getHostName() + ":" + replSet.ports[2], priority: 0} + ], + settings: {chainingAllowed: false} + }); + + // set up common points of access + var master = replSet.getPrimary(); + var primary = master.getDB("foo"); + replSet.nodes[1].setSlaveOk(); + replSet.nodes[2].setSlaveOk(); + var member2 = replSet.nodes[1].getDB("admin"); + var member3 = replSet.nodes[2].getDB("admin"); + + // Do an initial write + master.getDB("foo").bar.insert({x: 1}); + replSet.awaitReplication(); + + jsTest.log("Make sure 2 & 3 are syncing from the primary"); + member2.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]}); + member3.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]}); + + jsTest.log("Stop 2's replication"); + member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); + + jsTest.log("Do a few writes"); + for (var i = 0; i < 25; i++) { + primary.bar.insert({x: i}); + } + + jsTest.log("Make sure 3 is at write #25"); + waitForSameOplogPosition(primary, member3, "node 3 failed to catch up to the primary"); + // This means 3's buffer is empty + + jsTest.log("Stop 3's replication"); + member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); + // logLevel 3 will allow us to see each op the secondary pulls from the primary so that we can + // determine whether or not all ops are actually being pulled + member3.runCommand({setParameter: 1, logLevel: 3}); + + jsTest.log("Start 2's replication"); + member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}); + + jsTest.log("Do some writes"); + for (var i = 25; i < 50; i++) { + primary.bar.insert({x: i}); + } + + jsTest.log("Make sure 2 is at write #50"); + waitForSameOplogPosition(primary, member2, "node 2 failed to catch up to the primary"); + // This means 2's buffer is empty + + jsTest.log("Stop 2's replication"); + member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); + + jsTest.log( + "Do some writes - 2 & 3 should have up to write #75 in their buffers, but unapplied"); + for (var i = 50; i < 75; i++) { + primary.bar.insert({x: i}); + } + var primaryCollectionSize = primary.bar.find().itcount(); + jsTest.log("primary collection size: " + primaryCollectionSize); + var last = primary.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); + + jsTest.log("waiting a bit for the secondaries to get the write"); + sleep(10000); + + jsTest.log("Shut down the primary"); + replSet.stop(0); + + // make sure 3 doesn't try to sync from 2 + // the sleep 30sec is a hold over from the unsafe assert.throws(assert.soon()) + // which would check for 30 seconds that node 3 didn't try to sync from 2 + sleep(30 * 1000); + jsTest.log("3 should not attempt to sync from 2, as it cannot clear its buffer"); + var syncingTo = member3.adminCommand({replSetGetStatus: 1}).syncingTo; + assert(syncingTo !== getHostName() + ":" + replSet.ports[1], + "node 3 is syncing from node 2 :("); + + jsTest.log("Pause 3's bgsync thread"); + stopServerReplication(member3.getMongo()); + + // count documents in member 3 + assert.eq(26, + member3.getSisterDB("foo").bar.find().itcount(), + "collection size incorrect on node 3 before applying ops 25-75"); + + jsTest.log("Allow 3 to apply ops 25-75"); + assert.commandWorked(member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}), + "member 3 rsSyncApplyStop admin command failed"); + assert.soon(function() { - var last1 = db1.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); - var last2 = db2.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); - jsTest.log("primary: " + tojson(last1) + " secondary: " + tojson(last2)); - - return ((last1.ts.t === last2.ts.t) && (last1.ts.i === last2.ts.i)); - }, errmsg); -}; - -// start set -var replSet = new ReplSetTest({name: 'testSet', nodes: 3}); -replSet.startSet(); -replSet.initiate({ - _id: 'testSet', - members: [ - {_id: 0, host: getHostName() + ":" + replSet.ports[0]}, - {_id: 1, host: getHostName() + ":" + replSet.ports[1], priority: 0}, - {_id: 2, host: getHostName() + ":" + replSet.ports[2], priority: 0} - ], - settings: {chainingAllowed: false} -}); - -// set up common points of access -var master = replSet.getPrimary(); -var primary = master.getDB("foo"); -replSet.nodes[1].setSlaveOk(); -replSet.nodes[2].setSlaveOk(); -var member2 = replSet.nodes[1].getDB("admin"); -var member3 = replSet.nodes[2].getDB("admin"); - -// Do an initial write -master.getDB("foo").bar.insert({x: 1}); -replSet.awaitReplication(); - -jsTest.log("Make sure 2 & 3 are syncing from the primary"); -member2.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]}); -member3.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]}); - -jsTest.log("Stop 2's replication"); -member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); - -jsTest.log("Do a few writes"); -for (var i = 0; i < 25; i++) { - primary.bar.insert({x: i}); -} - -jsTest.log("Make sure 3 is at write #25"); -waitForSameOplogPosition(primary, member3, "node 3 failed to catch up to the primary"); -// This means 3's buffer is empty - -jsTest.log("Stop 3's replication"); -member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); -// logLevel 3 will allow us to see each op the secondary pulls from the primary so that we can -// determine whether or not all ops are actually being pulled -member3.runCommand({setParameter: 1, logLevel: 3}); - -jsTest.log("Start 2's replication"); -member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}); - -jsTest.log("Do some writes"); -for (var i = 25; i < 50; i++) { - primary.bar.insert({x: i}); -} - -jsTest.log("Make sure 2 is at write #50"); -waitForSameOplogPosition(primary, member2, "node 2 failed to catch up to the primary"); -// This means 2's buffer is empty - -jsTest.log("Stop 2's replication"); -member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'}); - -jsTest.log("Do some writes - 2 & 3 should have up to write #75 in their buffers, but unapplied"); -for (var i = 50; i < 75; i++) { - primary.bar.insert({x: i}); -} -var primaryCollectionSize = primary.bar.find().itcount(); -jsTest.log("primary collection size: " + primaryCollectionSize); -var last = primary.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); - -jsTest.log("waiting a bit for the secondaries to get the write"); -sleep(10000); - -jsTest.log("Shut down the primary"); -replSet.stop(0); - -// make sure 3 doesn't try to sync from 2 -// the sleep 30sec is a hold over from the unsafe assert.throws(assert.soon()) -// which would check for 30 seconds that node 3 didn't try to sync from 2 -sleep(30 * 1000); -jsTest.log("3 should not attempt to sync from 2, as it cannot clear its buffer"); -var syncingTo = member3.adminCommand({replSetGetStatus: 1}).syncingTo; -assert(syncingTo !== getHostName() + ":" + replSet.ports[1], "node 3 is syncing from node 2 :("); - -jsTest.log("Pause 3's bgsync thread"); -var pauseRsBgSyncProducerResult3 = - member3.runCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'}); -assert.eq(1, pauseRsBgSyncProducerResult3.ok, "member 3 pauseRsBgSyncProducer command failed"); - -// count documents in member 3 -assert.eq(26, - member3.getSisterDB("foo").bar.find().itcount(), - "collection size incorrect on node 3 before applying ops 25-75"); - -jsTest.log("Allow 3 to apply ops 25-75"); -assert.commandWorked(member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}), - "member 3 rsSyncApplyStop admin command failed"); - -assert.soon(function() { - var last3 = member3.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); - jsTest.log("primary: " + tojson(last, '', true) + " secondary: " + tojson(last3, '', true)); - jsTest.log("member 3 collection size: " + member3.getSisterDB("foo").bar.find().itcount()); - jsTest.log("curop: "); - printjson(member3.getSisterDB("foo").currentOp(true)); - return ((last.ts.t === last3.ts.t) && (last.ts.i === last3.ts.i)); -}, "Replication member 3 did not apply ops 25-75"); - -jsTest.log("Start 3's bgsync thread"); -member3.runCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'}); - -jsTest.log("Node 3 shouldn't hit rollback"); -var end = (new Date()).getTime() + 10000; -while ((new Date()).getTime() < end) { - assert('ROLLBACK' !== member3.runCommand({replSetGetStatus: 1}).members[2].stateStr); - sleep(30); -} - -// Need to re-enable writes before clean shutdown. -assert.commandWorked(member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'})); - -replSet.stopSet(); + var last3 = + member3.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next(); + jsTest.log("primary: " + tojson(last, '', true) + " secondary: " + tojson(last3, '', true)); + jsTest.log("member 3 collection size: " + member3.getSisterDB("foo").bar.find().itcount()); + jsTest.log("curop: "); + printjson(member3.getSisterDB("foo").currentOp(true)); + return ((last.ts.t === last3.ts.t) && (last.ts.i === last3.ts.i)); + }, "Replication member 3 did not apply ops 25-75"); + + jsTest.log("Start 3's bgsync thread"); + restartServerReplication(member3.getMongo()); + + jsTest.log("Node 3 shouldn't hit rollback"); + var end = (new Date()).getTime() + 10000; + while ((new Date()).getTime() < end) { + assert('ROLLBACK' !== member3.runCommand({replSetGetStatus: 1}).members[2].stateStr); + sleep(30); + } + + // Need to re-enable writes before clean shutdown. + assert.commandWorked(member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'})); + + replSet.stopSet(); + +}());
\ No newline at end of file diff --git a/jstests/sharding/config_version_rollback.js b/jstests/sharding/config_version_rollback.js index 8afff0ef0f0..2ad468b05bc 100644 --- a/jstests/sharding/config_version_rollback.js +++ b/jstests/sharding/config_version_rollback.js @@ -7,6 +7,8 @@ (function() { "use strict"; + load("jstests/libs/write_concern_util.js"); + // The config.version document is written on transition to primary. We need to ensure this // config.version document is rolled back for this test. // @@ -56,12 +58,9 @@ jsTest.log("Waiting for " + nodes[1] + " and " + nodes[2] + " to transition to SECONDARY."); configRS.waitForState([nodes[1], nodes[2]], ReplSetTest.State.SECONDARY); - jsTest.log("Stopping the OplogFetcher on all nodes"); + jsTest.log("Stopping the replication producer on all nodes"); // Now that the secondaries have finished initial sync and are electable, stop replication. - nodes.forEach(function(node) { - assert.commandWorked(node.getDB('admin').runCommand( - {configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'})); - }); + stopServerReplication([nodes[1], nodes[2]]); jsTest.log("Allowing the primary to write the config.version doc"); nodes.forEach(function(node) { @@ -115,10 +114,7 @@ assert.neq(origConfigVersionDoc.clusterId, newConfigVersionDoc.clusterId); jsTest.log("Re-enabling replication on all nodes"); - nodes.forEach(function(node) { - assert.commandWorked( - node.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'})); - }); + restartServerReplication(nodes); jsTest.log( "Waiting for original primary to rollback and replicate new config.version document"); diff --git a/jstests/sharding/shard_identity_rollback.js b/jstests/sharding/shard_identity_rollback.js index ac899c5edce..c7b6fedaacc 100644 --- a/jstests/sharding/shard_identity_rollback.js +++ b/jstests/sharding/shard_identity_rollback.js @@ -7,6 +7,8 @@ (function() { "use strict"; + load('jstests/libs/write_concern_util.js'); + var st = new ShardingTest({shards: 1}); var replTest = new ReplSetTest({nodes: 3}); @@ -23,14 +25,7 @@ replTest.awaitSecondaryNodes(); replTest.awaitReplication(); - nodes.forEach(function(node) { - // Pause bgsync so it doesn't keep trying to sync from other nodes. - assert.commandWorked( - node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'})); - // Stop oplog fetcher so that the ongoing fetcher doesn't return anything new. - assert.commandWorked( - node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'})); - }); + stopServerReplication(secondaries); jsTest.log("inserting shardIdentity document to primary that shouldn't replicate"); @@ -70,18 +65,7 @@ // Disable the fail point so that the elected node can exit drain mode and finish becoming // primary. - secondaries.forEach(function(secondary) { - assert.commandWorked( - secondary.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'})); - try { - assert.commandWorked( - secondary.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'})); - } catch (e) { - // Enabling bgsync producer may cause rollback, which will close all connections - // including the one sending "configureFailPoint". - print("got exception when disabling fail point 'pauseRsBgSyncProducer': " + e); - } - }); + restartServerReplication(secondaries); // Wait for a new healthy primary var newPriConn = replTest.getPrimary(); diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 7a5a088aae5..a4943f3f8a4 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -108,8 +108,6 @@ size_t getSize(const BSONObj& o) { } } // namespace -MONGO_FP_DECLARE(pauseRsBgSyncProducer); - // Failpoint which causes rollback to hang before starting. MONGO_FP_DECLARE(rollbackHangBeforeStart); @@ -259,12 +257,19 @@ void BackgroundSync::_runProducer() { } void BackgroundSync::_produce(OperationContext* txn) { + if (MONGO_FAIL_POINT(stopReplProducer)) { + // This log output is used in js tests so please leave it. + log() << "bgsync - stopReplProducer fail point " + "enabled. Blocking until fail point is disabled."; - while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) { - if (inShutdown()) { - return; - } - sleepmillis(10); + // TODO(SERVER-27120): Remove the return statement and uncomment the while loop. + // Currently we cannot block here or we prevent primaries from being fully elected since + // we'll never call _signalNoNewDataForApplier. + // while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) { + // mongo::sleepsecs(1); + // } + mongo::sleepsecs(1); + return; } // this oplog reader does not do a handshake because we don't want the server it's syncing @@ -383,10 +388,6 @@ void BackgroundSync::_produce(OperationContext* txn) { StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime()); } - if (MONGO_FAIL_POINT(stopOplogFetcher)) { - return; - } - // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( diff --git a/src/mongo/db/repl/data_replicator.cpp b/src/mongo/db/repl/data_replicator.cpp index f645e6a385a..4c25c9d43c9 100644 --- a/src/mongo/db/repl/data_replicator.cpp +++ b/src/mongo/db/repl/data_replicator.cpp @@ -753,7 +753,7 @@ void DataReplicator::_oplogFetcherCallback(const Status& oplogFetcherFinishStatu // It is up to the DatabasesCloner and MultiApplier to determine if they can proceed without any // additional data going into the oplog buffer. // It is not common for the OplogFetcher to return with an OK status. The only time it returns - // an OK status is when the 'stopOplogFetcher' fail point is enabled, which causes the + // an OK status is when the 'stopReplProducer' fail point is enabled, which causes the // OplogFetcher to ignore the current sync source response and return early. if (status.isOK()) { log() << "Finished fetching oplog fetching early. Last fetched optime and hash: " diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp index 0bba80369cd..f976b9ac18d 100644 --- a/src/mongo/db/repl/oplog_fetcher.cpp +++ b/src/mongo/db/repl/oplog_fetcher.cpp @@ -51,7 +51,7 @@ namespace repl { Seconds OplogFetcher::kDefaultProtocolZeroAwaitDataTimeout(2); -MONGO_FP_DECLARE(stopOplogFetcher); +MONGO_FP_DECLARE(stopReplProducer); namespace { @@ -423,12 +423,10 @@ void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result, } // Stop fetching and return on fail point. - // This fail point is intended to make the oplog fetcher ignore the downloaded batch of - // operations and not error out. - if (MONGO_FAIL_POINT(stopOplogFetcher)) { + // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not + // error out. + if (MONGO_FAIL_POINT(stopReplProducer)) { _finishCallback(Status::OK()); - // Wait for a while, otherwise, it will keep busy waiting. - sleepmillis(100); return; } diff --git a/src/mongo/db/repl/oplog_fetcher.h b/src/mongo/db/repl/oplog_fetcher.h index 987e33a1df9..0078471bf85 100644 --- a/src/mongo/db/repl/oplog_fetcher.h +++ b/src/mongo/db/repl/oplog_fetcher.h @@ -48,7 +48,7 @@ namespace mongo { namespace repl { -MONGO_FP_FORWARD_DECLARE(stopOplogFetcher); +MONGO_FP_FORWARD_DECLARE(stopReplProducer); /** * Used to keep track of the optime and hash of the last fetched operation. |