SERVER-27680 Merge stopOplogFetcher and pauseRsBgSyncProducer failpoint into single stopReplProducer failpoint

(cherry picked from commit 21948042b6da5fb5bf15897f9808a70551f5af09)
author: Spencer T Brody <spencer@mongodb.com> 2017-01-17 17:06:52 -0500
committer: Spencer T Brody <spencer@mongodb.com> 2017-01-23 16:18:50 -0500
commit: aca85db93035c35887f7ecf5a64f972bb0a2c82a (patch)
tree: 2deb4fe13f348a66e8905234482ffa4973f763e5
parent: f0cf61dbcdc569e1b3642097787068c7b1139273 (diff)
download: mongo-aca85db93035c35887f7ecf5a64f972bb0a2c82a.tar.gz
11 files changed, 247 insertions, 260 deletions
diff --git a/jstests/libs/check_log.js b/jstests/libs/check_log.js
index 8cb064f9558..5a9ba10435a 100644
--- a/jstests/libs/check_log.js
+++ b/jstests/libs/check_log.js
@@ -1,53 +1,64 @@
-"use strict";
-
 /*
  * Helper functions which connect to a server, and check its logs for particular strings.
  */
-var checkLog = (function() {
-    /*
-     * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until the
-     * provided 'msg' is found in the logs, or 5 minutes have elapsed. Throws an exception on
-     * timeout.
-     */
-    var contains = function(conn, msg) {
-        assert.soon(
-            function() {
-                var logMessages = assert.commandWorked(conn.adminCommand({getLog: 'global'})).log;
-                for (var i = 0; i < logMessages.length; i++) {
-                    if (logMessages[i].indexOf(msg) != -1) {
-                        return true;
+var checkLog;
+
+(function() {
+    "use strict";
+
+    if (checkLog) {
+        return;  // Protect against this file being double-loaded.
+    }
+
+    checkLog = (function() {
+        /*
+         * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
+         * the provided 'msg' is found in the logs, or 5 minutes have elapsed. Throws an exception
+         * on timeout.
+         */
+        var contains = function(conn, msg) {
+            assert.soon(
+                function() {
+                    var logMessages =
+                        assert.commandWorked(conn.adminCommand({getLog: 'global'})).log;
+                    for (var i = 0; i < logMessages.length; i++) {
+                        if (logMessages[i].indexOf(msg) != -1) {
+                            return true;
+                        }
                     }
-                }
-                return false;
-            },
-            'Could not find log entries containing the following message: ' + msg,
-            5 * 60 * 1000,
-            300);
-    };
+                    return false;
+                },
+                'Could not find log entries containing the following message: ' + msg,
+                5 * 60 * 1000,
+                300);
+        };
 
-    /*
-     * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until the
-     * provided 'msg' is found in the logs exactly 'expectedCount' times, or 5 minutes have elapsed.
-     * Throws an exception on timeout.
-     */
-    var containsWithCount = function(conn, msg, expectedCount) {
-        var count = 0;
-        assert.soon(
-            function() {
-                var logMessages = assert.commandWorked(conn.adminCommand({getLog: 'global'})).log;
-                for (var i = 0; i < logMessages.length; i++) {
-                    if (logMessages[i].indexOf(msg) != -1) {
-                        count++;
+        /*
+         * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
+         * the provided 'msg' is found in the logs exactly 'expectedCount' times, or 5 minutes have
+         * elapsed.
+         * Throws an exception on timeout.
+         */
+        var containsWithCount = function(conn, msg, expectedCount) {
+            var count = 0;
+            assert.soon(
+                function() {
+                    var logMessages =
+                        assert.commandWorked(conn.adminCommand({getLog: 'global'})).log;
+                    for (var i = 0; i < logMessages.length; i++) {
+                        if (logMessages[i].indexOf(msg) != -1) {
+                            count++;
+                        }
                     }
-                }
 
-                return expectedCount === count;
-            },
-            'Expected ' + expectedCount + ', but instead saw ' + count +
-                ' log entries containing the following message: ' + msg,
-            5 * 60 * 1000,
-            300);
-    };
+                    return expectedCount === count;
+                },
+                'Expected ' + expectedCount + ', but instead saw ' + count +
+                    ' log entries containing the following message: ' + msg,
+                5 * 60 * 1000,
+                300);
+        };
 
-    return {contains: contains, containsWithCount: containsWithCount};
+        return {contains: contains, containsWithCount: containsWithCount};
+    })();
 })();
diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js
index d7541b3b9a6..0115fda8c4c 100644
--- a/jstests/libs/write_concern_util.js
+++ b/jstests/libs/write_concern_util.js
@@ -2,6 +2,8 @@
  * Utilities for testing writeConcern.
  */
 
+load("jstests/libs/check_log.js");
+
 // Shards a collection and creates 2 chunks, one on each s of two shards.
 function shardCollectionWithChunks(st, coll) {
     var _db = coll.getDB();
@@ -25,10 +27,13 @@ function stopServerReplication(conn) {
         });
         return;
     }
-    var errMsg = 'Failed to enable stopOplogFetcher failpoint.';
+    var errMsg = 'Failed to enable stopReplProducer failpoint.';
     assert.commandWorked(
-        conn.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'}),
+        conn.getDB('admin').runCommand({configureFailPoint: 'stopReplProducer', mode: 'alwaysOn'}),
         errMsg);
+
+    // Wait until the fail point is actually hit.
+    checkLog.contains(conn, 'bgsync - stopReplProducer fail point enabled');
 }
 
 // Stops replication at all replicaset secondaries.
@@ -50,9 +55,9 @@ function restartServerReplication(conn) {
         return;
     }
 
-    var errMsg = 'Failed to disable stopOplogFetcher failpoint.';
+    var errMsg = 'Failed to disable stopReplProducer failpoint.';
     assert.commandWorked(
-        conn.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}),
+        conn.getDB('admin').runCommand({configureFailPoint: 'stopReplProducer', mode: 'off'}),
         errMsg);
 }
 
diff --git a/jstests/replsets/catchup.js b/jstests/replsets/catchup.js
index 8be2d20da93..8cb3ec68ae9 100644
--- a/jstests/replsets/catchup.js
+++ b/jstests/replsets/catchup.js
@@ -4,6 +4,7 @@
     "use strict";
 
     load("jstests/libs/check_log.js");
+    load("jstests/libs/write_concern_util.js");
     load("jstests/replsets/rslib.js");
 
     var name = "catch_up";
@@ -33,31 +34,6 @@
         node.adminCommand(verbosity);
     });
 
-    function enableFailPoint(node) {
-        jsTest.log("enable failpoint " + node.host);
-        // Disable syncing on both secondaries.
-        assert.commandWorked(
-            node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'}),
-            'Failed to configure pauseRsBgSyncProducer failpoint.');
-        assert.commandWorked(
-            node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'}));
-    }
-
-    function disableFailPoint(node) {
-        jsTest.log("disable failpoint " + node.host);
-        assert.commandWorked(
-            node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}));
-        try {
-            assert.commandWorked(
-                node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'}),
-                'Failed to disable pauseRsBgSyncProducer failpoint.');
-        } catch (e) {
-            // Enable bgsync producer may cause rollback, which will close all connections
-            // including the one sending "configureFailPoint".
-            print("got exception when disabling fail point 'pauseRsBgSyncProducer': " + e);
-        }
-    }
-
     function stepUp(node) {
         assert.commandWorked(node.adminCommand({replSetStepUp: 1}));
         return node;
@@ -96,7 +72,7 @@
     jsTest.log("Case 2: The primary needs to catch up, succeeds in time.");
     // Write documents that cannot be replicated to secondaries in time.
     var originalSecondaries = rst.getSecondaries();
-    originalSecondaries.forEach(enableFailPoint);
+    stopServerReplication(originalSecondaries);
     doWrites(rst.getPrimary());
     var latestOp = getLatestOp(rst.getPrimary());
     // New primary wins immediately, but needs to catch up.
@@ -105,7 +81,7 @@
     // Check this node is not writable.
     assert.eq(newPrimary.getDB("test").isMaster().ismaster, false);
     // Disable fail point to allow replication.
-    originalSecondaries.forEach(disableFailPoint);
+    restartServerReplication(originalSecondaries);
     // getPrimary() blocks until the primary finishes drain mode.
     assert.eq(newPrimary, rst.getPrimary());
     // Wait for all secondaries to catch up
@@ -116,7 +92,7 @@
 
     jsTest.log("Case 3: The primary needs to catch up, but has to change sync source to catch up.");
     // Write documents that cannot be replicated to secondaries in time.
-    rst.getSecondaries().forEach(enableFailPoint);
+    stopServerReplication(rst.getSecondaries());
     doWrites(rst.getPrimary());
     var oldPrimary = rst.getPrimary();
     originalSecondaries = rst.getSecondaries();
@@ -125,13 +101,13 @@
     rst.awaitNodesAgreeOnPrimary();
     // Disable fail point on one of the other secondaries.
     // Wait until it catches up with the old primary.
-    disableFailPoint(originalSecondaries[1]);
+    restartServerReplication(originalSecondaries[1]);
     assert.commandWorked(originalSecondaries[1].adminCommand({replSetSyncFrom: oldPrimary.host}));
     awaitOpTime(originalSecondaries[1], latestOp.ts);
     // Disconnect the new primary and the old one.
     oldPrimary.disconnect(newPrimary);
     // Disable the failpoint, the new primary should sync from the other secondary.
-    disableFailPoint(newPrimary);
+    restartServerReplication(newPrimary);
     assert.eq(newPrimary, rst.getPrimary());
     checkOpInOplog(newPrimary, latestOp, 1);
     // Restore the broken connection
@@ -148,7 +124,7 @@
 
     // Write documents that cannot be replicated to secondaries in time.
     originalSecondaries = rst.getSecondaries();
-    originalSecondaries.forEach(enableFailPoint);
+    stopServerReplication(originalSecondaries);
     doWrites(rst.getPrimary());
     latestOp = getLatestOp(rst.getPrimary());
 
@@ -158,7 +134,7 @@
     var latestOpOnNewPrimary = getLatestOp(newPrimary);
     // Wait until the new primary completes the transition to primary and writes a no-op.
     checkLog.contains(newPrimary, "Cannot catch up oplog after becoming primary");
-    disableFailPoint(newPrimary);
+    restartServerReplication(newPrimary);
     assert.eq(newPrimary, rst.getPrimary());
 
     // Wait for the no-op "new primary" after winning an election, so that we know it has
@@ -168,5 +144,5 @@
     });
     // The extra oplog entries on the old primary are not replicated to the new one.
     checkOpInOplog(newPrimary, latestOp, 0);
-    disableFailPoint(originalSecondaries[1]);
+    restartServerReplication(originalSecondaries[1]);
 })();
diff --git a/jstests/replsets/read_committed_stale_history.js b/jstests/replsets/read_committed_stale_history.js
index 9793a47576e..5751ba9cf28 100644
--- a/jstests/replsets/read_committed_stale_history.js
+++ b/jstests/replsets/read_committed_stale_history.js
@@ -89,6 +89,9 @@
     nodes[0].disconnect(nodes[1]);
     nodes[0].disconnect(nodes[2]);
 
+    // Ensure the soon-to-be primary cannot see the write from the old primary.
+    assert.eq(null, nodes[1].getDB(dbName).getCollection(collName).findOne({a: 2}));
+
     jsTest.log("Wait for a new primary to be elected");
     // Allow the secondaries to replicate again.
     restartServerReplication(secondaries);
@@ -99,6 +102,9 @@
     assert.writeOK(nodes[1].getDB(dbName).getCollection(collName).insert(
         {a: 3}, {writeConcern: {w: 2, wtimeout: rst.kDefaultTimeoutMS}}));
 
+    // Ensure the new primary still cannot see the write from the old primary.
+    assert.eq(null, nodes[1].getDB(dbName).getCollection(collName).findOne({a: 2}));
+
     jsTest.log("Reconnect the old primary to the rest of the nodes");
     nodes[1].reconnect(nodes[0]);
     nodes[2].reconnect(nodes[0]);
diff --git a/jstests/replsets/server8070.js b/jstests/replsets/server8070.js
index 5c3a4e4a70f..6be51507d4d 100644
--- a/jstests/replsets/server8070.js
+++ b/jstests/replsets/server8070.js
@@ -3,135 +3,145 @@
 // data (member3), then puts 50 more ops in member3's buffer and makes sure that member3 doesn't try
 // to sync from member2.
 
-// helper to ensure two nodes are at the same place in the oplog
-var waitForSameOplogPosition = function(db1, db2, errmsg) {
+(function() {
+    "use strict";
+
+    load('jstests/libs/write_concern_util.js');
+
+    // helper to ensure two nodes are at the same place in the oplog
+    var waitForSameOplogPosition = function(db1, db2, errmsg) {
+        assert.soon(function() {
+            var last1 =
+                db1.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
+            var last2 =
+                db2.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
+            jsTest.log("primary: " + tojson(last1) + " secondary: " + tojson(last2));
+
+            return ((last1.ts.t === last2.ts.t) && (last1.ts.i === last2.ts.i));
+        }, errmsg);
+    };
+
+    // start set
+    var replSet = new ReplSetTest({name: 'testSet', nodes: 3});
+    replSet.startSet();
+    replSet.initiate({
+        _id: 'testSet',
+        members: [
+            {_id: 0, host: getHostName() + ":" + replSet.ports[0]},
+            {_id: 1, host: getHostName() + ":" + replSet.ports[1], priority: 0},
+            {_id: 2, host: getHostName() + ":" + replSet.ports[2], priority: 0}
+        ],
+        settings: {chainingAllowed: false}
+    });
+
+    // set up common points of access
+    var master = replSet.getPrimary();
+    var primary = master.getDB("foo");
+    replSet.nodes[1].setSlaveOk();
+    replSet.nodes[2].setSlaveOk();
+    var member2 = replSet.nodes[1].getDB("admin");
+    var member3 = replSet.nodes[2].getDB("admin");
+
+    // Do an initial write
+    master.getDB("foo").bar.insert({x: 1});
+    replSet.awaitReplication();
+
+    jsTest.log("Make sure 2 & 3 are syncing from the primary");
+    member2.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]});
+    member3.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]});
+
+    jsTest.log("Stop 2's replication");
+    member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
+
+    jsTest.log("Do a few writes");
+    for (var i = 0; i < 25; i++) {
+        primary.bar.insert({x: i});
+    }
+
+    jsTest.log("Make sure 3 is at write #25");
+    waitForSameOplogPosition(primary, member3, "node 3 failed to catch up to the primary");
+    // This means 3's buffer is empty
+
+    jsTest.log("Stop 3's replication");
+    member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
+    // logLevel 3 will allow us to see each op the secondary pulls from the primary so that we can
+    // determine whether or not all ops are actually being pulled
+    member3.runCommand({setParameter: 1, logLevel: 3});
+
+    jsTest.log("Start 2's replication");
+    member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'});
+
+    jsTest.log("Do some writes");
+    for (var i = 25; i < 50; i++) {
+        primary.bar.insert({x: i});
+    }
+
+    jsTest.log("Make sure 2 is at write #50");
+    waitForSameOplogPosition(primary, member2, "node 2 failed to catch up to the primary");
+    // This means 2's buffer is empty
+
+    jsTest.log("Stop 2's replication");
+    member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
+
+    jsTest.log(
+        "Do some writes - 2 & 3 should have up to write #75 in their buffers, but unapplied");
+    for (var i = 50; i < 75; i++) {
+        primary.bar.insert({x: i});
+    }
+    var primaryCollectionSize = primary.bar.find().itcount();
+    jsTest.log("primary collection size: " + primaryCollectionSize);
+    var last = primary.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
+
+    jsTest.log("waiting a bit for the secondaries to get the write");
+    sleep(10000);
+
+    jsTest.log("Shut down the primary");
+    replSet.stop(0);
+
+    // make sure 3 doesn't try to sync from 2
+    // the sleep 30sec is a hold over from the unsafe assert.throws(assert.soon())
+    // which would check for 30 seconds that node 3 didn't try to sync from 2
+    sleep(30 * 1000);
+    jsTest.log("3 should not attempt to sync from 2, as it cannot clear its buffer");
+    var syncingTo = member3.adminCommand({replSetGetStatus: 1}).syncingTo;
+    assert(syncingTo !== getHostName() + ":" + replSet.ports[1],
+           "node 3 is syncing from node 2 :(");
+
+    jsTest.log("Pause 3's bgsync thread");
+    stopServerReplication(member3.getMongo());
+
+    // count documents in member 3
+    assert.eq(26,
+              member3.getSisterDB("foo").bar.find().itcount(),
+              "collection size incorrect on node 3 before applying ops 25-75");
+
+    jsTest.log("Allow 3 to apply ops 25-75");
+    assert.commandWorked(member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}),
+                         "member 3 rsSyncApplyStop admin command failed");
+
     assert.soon(function() {
-        var last1 = db1.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
-        var last2 = db2.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
-        jsTest.log("primary: " + tojson(last1) + " secondary: " + tojson(last2));
-
-        return ((last1.ts.t === last2.ts.t) && (last1.ts.i === last2.ts.i));
-    }, errmsg);
-};
-
-// start set
-var replSet = new ReplSetTest({name: 'testSet', nodes: 3});
-replSet.startSet();
-replSet.initiate({
-    _id: 'testSet',
-    members: [
-        {_id: 0, host: getHostName() + ":" + replSet.ports[0]},
-        {_id: 1, host: getHostName() + ":" + replSet.ports[1], priority: 0},
-        {_id: 2, host: getHostName() + ":" + replSet.ports[2], priority: 0}
-    ],
-    settings: {chainingAllowed: false}
-});
-
-// set up common points of access
-var master = replSet.getPrimary();
-var primary = master.getDB("foo");
-replSet.nodes[1].setSlaveOk();
-replSet.nodes[2].setSlaveOk();
-var member2 = replSet.nodes[1].getDB("admin");
-var member3 = replSet.nodes[2].getDB("admin");
-
-// Do an initial write
-master.getDB("foo").bar.insert({x: 1});
-replSet.awaitReplication();
-
-jsTest.log("Make sure 2 & 3 are syncing from the primary");
-member2.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]});
-member3.adminCommand({replSetSyncFrom: getHostName() + ":" + replSet.ports[0]});
-
-jsTest.log("Stop 2's replication");
-member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
-
-jsTest.log("Do a few writes");
-for (var i = 0; i < 25; i++) {
-    primary.bar.insert({x: i});
-}
-
-jsTest.log("Make sure 3 is at write #25");
-waitForSameOplogPosition(primary, member3, "node 3 failed to catch up to the primary");
-// This means 3's buffer is empty
-
-jsTest.log("Stop 3's replication");
-member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
-// logLevel 3 will allow us to see each op the secondary pulls from the primary so that we can
-// determine whether or not all ops are actually being pulled
-member3.runCommand({setParameter: 1, logLevel: 3});
-
-jsTest.log("Start 2's replication");
-member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'});
-
-jsTest.log("Do some writes");
-for (var i = 25; i < 50; i++) {
-    primary.bar.insert({x: i});
-}
-
-jsTest.log("Make sure 2 is at write #50");
-waitForSameOplogPosition(primary, member2, "node 2 failed to catch up to the primary");
-// This means 2's buffer is empty
-
-jsTest.log("Stop 2's replication");
-member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'alwaysOn'});
-
-jsTest.log("Do some writes - 2 & 3 should have up to write #75 in their buffers, but unapplied");
-for (var i = 50; i < 75; i++) {
-    primary.bar.insert({x: i});
-}
-var primaryCollectionSize = primary.bar.find().itcount();
-jsTest.log("primary collection size: " + primaryCollectionSize);
-var last = primary.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
-
-jsTest.log("waiting a bit for the secondaries to get the write");
-sleep(10000);
-
-jsTest.log("Shut down the primary");
-replSet.stop(0);
-
-// make sure 3 doesn't try to sync from 2
-// the sleep 30sec is a hold over from the unsafe assert.throws(assert.soon())
-// which would check for 30 seconds that node 3 didn't try to sync from 2
-sleep(30 * 1000);
-jsTest.log("3 should not attempt to sync from 2, as it cannot clear its buffer");
-var syncingTo = member3.adminCommand({replSetGetStatus: 1}).syncingTo;
-assert(syncingTo !== getHostName() + ":" + replSet.ports[1], "node 3 is syncing from node 2 :(");
-
-jsTest.log("Pause 3's bgsync thread");
-var pauseRsBgSyncProducerResult3 =
-    member3.runCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'});
-assert.eq(1, pauseRsBgSyncProducerResult3.ok, "member 3 pauseRsBgSyncProducer command failed");
-
-// count documents in member 3
-assert.eq(26,
-          member3.getSisterDB("foo").bar.find().itcount(),
-          "collection size incorrect on node 3 before applying ops 25-75");
-
-jsTest.log("Allow 3 to apply ops 25-75");
-assert.commandWorked(member3.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}),
-                     "member 3 rsSyncApplyStop admin command failed");
-
-assert.soon(function() {
-    var last3 = member3.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
-    jsTest.log("primary: " + tojson(last, '', true) + " secondary: " + tojson(last3, '', true));
-    jsTest.log("member 3 collection size: " + member3.getSisterDB("foo").bar.find().itcount());
-    jsTest.log("curop: ");
-    printjson(member3.getSisterDB("foo").currentOp(true));
-    return ((last.ts.t === last3.ts.t) && (last.ts.i === last3.ts.i));
-}, "Replication member 3 did not apply ops 25-75");
-
-jsTest.log("Start 3's bgsync thread");
-member3.runCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'});
-
-jsTest.log("Node 3 shouldn't hit rollback");
-var end = (new Date()).getTime() + 10000;
-while ((new Date()).getTime() < end) {
-    assert('ROLLBACK' !== member3.runCommand({replSetGetStatus: 1}).members[2].stateStr);
-    sleep(30);
-}
-
-// Need to re-enable writes before clean shutdown.
-assert.commandWorked(member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
-
-replSet.stopSet();
+        var last3 =
+            member3.getSisterDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
+        jsTest.log("primary: " + tojson(last, '', true) + " secondary: " + tojson(last3, '', true));
+        jsTest.log("member 3 collection size: " + member3.getSisterDB("foo").bar.find().itcount());
+        jsTest.log("curop: ");
+        printjson(member3.getSisterDB("foo").currentOp(true));
+        return ((last.ts.t === last3.ts.t) && (last.ts.i === last3.ts.i));
+    }, "Replication member 3 did not apply ops 25-75");
+
+    jsTest.log("Start 3's bgsync thread");
+    restartServerReplication(member3.getMongo());
+
+    jsTest.log("Node 3 shouldn't hit rollback");
+    var end = (new Date()).getTime() + 10000;
+    while ((new Date()).getTime() < end) {
+        assert('ROLLBACK' !== member3.runCommand({replSetGetStatus: 1}).members[2].stateStr);
+        sleep(30);
+    }
+
+    // Need to re-enable writes before clean shutdown.
+    assert.commandWorked(member2.runCommand({configureFailPoint: 'rsSyncApplyStop', mode: 'off'}));
+
+    replSet.stopSet();
+
+}());
+\ No newline at end of file
diff --git a/jstests/sharding/config_version_rollback.js b/jstests/sharding/config_version_rollback.js
index 8afff0ef0f0..2ad468b05bc 100644
--- a/jstests/sharding/config_version_rollback.js
+++ b/jstests/sharding/config_version_rollback.js
@@ -7,6 +7,8 @@
 (function() {
     "use strict";
 
+    load("jstests/libs/write_concern_util.js");
+
     // The config.version document is written on transition to primary. We need to ensure this
     // config.version document is rolled back for this test.
     //
@@ -56,12 +58,9 @@
     jsTest.log("Waiting for " + nodes[1] + " and " + nodes[2] + " to transition to SECONDARY.");
     configRS.waitForState([nodes[1], nodes[2]], ReplSetTest.State.SECONDARY);
 
-    jsTest.log("Stopping the OplogFetcher on all nodes");
+    jsTest.log("Stopping the replication producer on all nodes");
     // Now that the secondaries have finished initial sync and are electable, stop replication.
-    nodes.forEach(function(node) {
-        assert.commandWorked(node.getDB('admin').runCommand(
-            {configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'}));
-    });
+    stopServerReplication([nodes[1], nodes[2]]);
 
     jsTest.log("Allowing the primary to write the config.version doc");
     nodes.forEach(function(node) {
@@ -115,10 +114,7 @@
     assert.neq(origConfigVersionDoc.clusterId, newConfigVersionDoc.clusterId);
 
     jsTest.log("Re-enabling replication on all nodes");
-    nodes.forEach(function(node) {
-        assert.commandWorked(
-            node.getDB('admin').runCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}));
-    });
+    restartServerReplication(nodes);
 
     jsTest.log(
         "Waiting for original primary to rollback and replicate new config.version document");
diff --git a/jstests/sharding/shard_identity_rollback.js b/jstests/sharding/shard_identity_rollback.js
index ac899c5edce..c7b6fedaacc 100644
--- a/jstests/sharding/shard_identity_rollback.js
+++ b/jstests/sharding/shard_identity_rollback.js
@@ -7,6 +7,8 @@
 (function() {
     "use strict";
 
+    load('jstests/libs/write_concern_util.js');
+
     var st = new ShardingTest({shards: 1});
 
     var replTest = new ReplSetTest({nodes: 3});
@@ -23,14 +25,7 @@
     replTest.awaitSecondaryNodes();
     replTest.awaitReplication();
 
-    nodes.forEach(function(node) {
-        // Pause bgsync so it doesn't keep trying to sync from other nodes.
-        assert.commandWorked(
-            node.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'alwaysOn'}));
-        // Stop oplog fetcher so that the ongoing fetcher doesn't return anything new.
-        assert.commandWorked(
-            node.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'alwaysOn'}));
-    });
+    stopServerReplication(secondaries);
 
     jsTest.log("inserting shardIdentity document to primary that shouldn't replicate");
 
@@ -70,18 +65,7 @@
 
     // Disable the fail point so that the elected node can exit drain mode and finish becoming
     // primary.
-    secondaries.forEach(function(secondary) {
-        assert.commandWorked(
-            secondary.adminCommand({configureFailPoint: 'stopOplogFetcher', mode: 'off'}));
-        try {
-            assert.commandWorked(
-                secondary.adminCommand({configureFailPoint: 'pauseRsBgSyncProducer', mode: 'off'}));
-        } catch (e) {
-            // Enabling bgsync producer may cause rollback, which will close all connections
-            // including the one sending "configureFailPoint".
-            print("got exception when disabling fail point 'pauseRsBgSyncProducer': " + e);
-        }
-    });
+    restartServerReplication(secondaries);
 
     // Wait for a new healthy primary
     var newPriConn = replTest.getPrimary();
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 7a5a088aae5..a4943f3f8a4 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -108,8 +108,6 @@ size_t getSize(const BSONObj& o) {
 }
 }  // namespace
 
-MONGO_FP_DECLARE(pauseRsBgSyncProducer);
-
 // Failpoint which causes rollback to hang before starting.
 MONGO_FP_DECLARE(rollbackHangBeforeStart);
 
@@ -259,12 +257,19 @@ void BackgroundSync::_runProducer() {
 }
 
 void BackgroundSync::_produce(OperationContext* txn) {
+    if (MONGO_FAIL_POINT(stopReplProducer)) {
+        // This log output is used in js tests so please leave it.
+        log() << "bgsync - stopReplProducer fail point "
+                 "enabled. Blocking until fail point is disabled.";
 
-    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
-        if (inShutdown()) {
-            return;
-        }
-        sleepmillis(10);
+        // TODO(SERVER-27120): Remove the return statement and uncomment the while loop.
+        // Currently we cannot block here or we prevent primaries from being fully elected since
+        // we'll never call _signalNoNewDataForApplier.
+        //        while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) {
+        //            mongo::sleepsecs(1);
+        //        }
+        mongo::sleepsecs(1);
+        return;
     }
 
     // this oplog reader does not do a handshake because we don't want the server it's syncing
@@ -383,10 +388,6 @@ void BackgroundSync::_produce(OperationContext* txn) {
         StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
     }
 
-    if (MONGO_FAIL_POINT(stopOplogFetcher)) {
-        return;
-    }
-
     // "lastFetched" not used. Already set in _enqueueDocuments.
     Status fetcherReturnStatus = Status::OK();
     DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
diff --git a/src/mongo/db/repl/data_replicator.cpp b/src/mongo/db/repl/data_replicator.cpp
index f645e6a385a..4c25c9d43c9 100644
--- a/src/mongo/db/repl/data_replicator.cpp
+++ b/src/mongo/db/repl/data_replicator.cpp
@@ -753,7 +753,7 @@ void DataReplicator::_oplogFetcherCallback(const Status& oplogFetcherFinishStatu
     // It is up to the DatabasesCloner and MultiApplier to determine if they can proceed without any
     // additional data going into the oplog buffer.
     // It is not common for the OplogFetcher to return with an OK status. The only time it returns
-    // an OK status is when the 'stopOplogFetcher' fail point is enabled, which causes the
+    // an OK status is when the 'stopReplProducer' fail point is enabled, which causes the
     // OplogFetcher to ignore the current sync source response and return early.
     if (status.isOK()) {
         log() << "Finished fetching oplog fetching early. Last fetched optime and hash: "
diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp
index 0bba80369cd..f976b9ac18d 100644
--- a/src/mongo/db/repl/oplog_fetcher.cpp
+++ b/src/mongo/db/repl/oplog_fetcher.cpp
@@ -51,7 +51,7 @@ namespace repl {
 
 Seconds OplogFetcher::kDefaultProtocolZeroAwaitDataTimeout(2);
 
-MONGO_FP_DECLARE(stopOplogFetcher);
+MONGO_FP_DECLARE(stopReplProducer);
 
 namespace {
 
@@ -423,12 +423,10 @@ void OplogFetcher::_callback(const Fetcher::QueryResponseStatus& result,
     }
 
     // Stop fetching and return on fail point.
-    // This fail point is intended to make the oplog fetcher ignore the downloaded batch of
-    // operations and not error out.
-    if (MONGO_FAIL_POINT(stopOplogFetcher)) {
+    // This fail point makes the oplog fetcher ignore the downloaded batch of operations and not
+    // error out.
+    if (MONGO_FAIL_POINT(stopReplProducer)) {
         _finishCallback(Status::OK());
-        // Wait for a while, otherwise, it will keep busy waiting.
-        sleepmillis(100);
         return;
     }
 
diff --git a/src/mongo/db/repl/oplog_fetcher.h b/src/mongo/db/repl/oplog_fetcher.h
index 987e33a1df9..0078471bf85 100644
--- a/src/mongo/db/repl/oplog_fetcher.h
+++ b/src/mongo/db/repl/oplog_fetcher.h
@@ -48,7 +48,7 @@
 namespace mongo {
 namespace repl {
 
-MONGO_FP_FORWARD_DECLARE(stopOplogFetcher);
+MONGO_FP_FORWARD_DECLARE(stopReplProducer);
 
 /**
  * Used to keep track of the optime and hash of the last fetched operation.
author	Spencer T Brody <spencer@mongodb.com>	2017-01-17 17:06:52 -0500
committer	Spencer T Brody <spencer@mongodb.com>	2017-01-23 16:18:50 -0500
commit	aca85db93035c35887f7ecf5a64f972bb0a2c82a (patch)
tree	2deb4fe13f348a66e8905234482ffa4973f763e5
parent	f0cf61dbcdc569e1b3642097787068c7b1139273 (diff)
download	mongo-aca85db93035c35887f7ecf5a64f972bb0a2c82a.tar.gz