author     Ramon Fernandez <ramon@mongodb.com>    2017-04-20 17:43:39 -0400
committer  Ramon Fernandez <ramon@mongodb.com>    2017-04-20 17:43:39 -0400
commit     94a762ba470c21db4db14afb692c1b7dc5f4c69b (patch)
tree       64d34c2be2d14dbba211f3f39670eb3adfccfe23
parent     8d35bbbf1c1c08969eb40fee5144ff01b503273e (diff)
download   mongo-94a762ba470c21db4db14afb692c1b7dc5f4c69b.tar.gz
Revert "SERVER-26360 Node should be able to leave RECOVERING after going too stale"
This reverts commit 31a2a5bcec525c9dc62cfdd06d126673f507c8df.
-rw-r--r--  jstests/replsets/too_stale_secondary.js  140
-rw-r--r--  jstests/replsets/toostale.js              126
-rw-r--r--  src/mongo/db/repl/bgsync.cpp               24
-rw-r--r--  src/mongo/db/repl/bgsync.h                 42
4 files changed, 139 insertions, 193 deletions
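The "too stale" condition exercised by both tests in this diff is visible to operators through the replSetGetStatus command: the stale member reports state 3 (RECOVERING). A minimal shell sketch of that check follows (not part of this commit; the member index mirrors Node 2 / s2 from the tests below, and the connection is assumed to point at any reachable member):

    // Ask the replica set for member status and inspect the third member (index 2),
    // which is the node both tests deliberately let fall off the back of the oplog.
    var status = db.adminCommand({replSetGetStatus: 1});
    var member = status.members[2];
    print(member.name + ": state " + member.state + " (" + member.stateStr + ")");
    // While the member is too stale it sits in maintenance mode and reports RECOVERING.
    assert.eq(3, member.state);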
diff --git a/jstests/replsets/too_stale_secondary.js b/jstests/replsets/too_stale_secondary.js
deleted file mode 100644
index 369662e5f16..00000000000
--- a/jstests/replsets/too_stale_secondary.js
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * This test ensures that a secondary that has gone "too stale" (i.e. cannot find another node with
- * a common oplog point) will transition to RECOVERING state, stay in RECOVERING after restart, and
- * transition back to SECONDARY once it finds a sync source with a common oplog point.
- *
- * Note: This test requires persistence in order for a restarted node with a stale oplog to stay in
- * the RECOVERING state. A restarted node with an ephemeral storage engine will not have an oplog
- * upon restart, so it will immediately resync.
- *
- * @tags: [requires_persistence]
- *
- * Replica Set Setup:
- *
- * Node 0 (PRIMARY) : Small Oplog
- * Node 1 (SECONDARY) : Large Oplog
- * Node 2 (SECONDARY) : Small Oplog
- *
- * 1: Insert one document on the primary (Node 0) and ensure it is replicated.
- * 2: Stop node 2.
- * 3: Wait until Node 2 is down.
- * 4: Overflow the primary's oplog.
- * 5: Stop Node 1 and restart Node 2.
- * 6: Wait for Node 2 to transition to RECOVERING (it should be too stale).
- * 7: Stop and restart Node 2.
- * 8: Wait for Node 2 to transition to RECOVERING (its oplog should remain stale after restart).
- * 9: Restart Node 1, which should have the full oplog history.
- * 10: Wait for Node 2 to leave RECOVERING and transition to SECONDARY.
- *
- */
-
-(function() {
- load('jstests/replsets/rslib.js');
-
- "use strict";
-
- function getFirstOplogEntry(conn) {
- return conn.getDB('local').oplog.rs.find().sort({ts: 1}).limit(1)[0];
- }
-
- /**
- * Overflows the oplog of a given node.
- *
- * To detect oplog overflow, we continuously insert large documents until we
- * detect that the first entry of the oplog is no longer the same as when we started. This
- * implies that the oplog attempted to grow beyond its maximum size, i.e. it
- * has overflowed/rolled over.
- *
- * Each document will be inserted with a writeConcern given by 'writeConcern'.
- *
- */
- function overflowOplog(conn, db, writeConcern) {
- var firstOplogEntry = getFirstOplogEntry(conn);
- var collName = "overflow";
-
- // Keep inserting large documents until the oplog rolls over.
- const largeStr = new Array(32 * 1024).join('aaaaaaaa');
- while (bsonWoCompare(getFirstOplogEntry(conn), firstOplogEntry) === 0) {
- assert.writeOK(
- db[collName].insert({data: largeStr}, {writeConcern: {w: writeConcern}}));
- }
- }
-
- var testName = "too_stale_secondary";
-
- var smallOplogSizeMB = 1;
- var bigOplogSizeMB = 1000;
-
- // Node 0 is given a small oplog so we can overflow it. Node 1's large oplog allows it to store
- // all entries comfortably without overflowing, so that Node 2 can eventually use it as a sync
- // source after it goes too stale.
- var replTest = new ReplSetTest({
- name: testName,
- nodes: [
- {oplogSize: smallOplogSizeMB},
- {oplogSize: bigOplogSizeMB},
- {oplogSize: smallOplogSizeMB}
- ]
- });
-
- var nodes = replTest.startSet();
- replTest.initiate({
- _id: testName,
- members: [
- {_id: 0, host: nodes[0].host},
- {_id: 1, host: nodes[1].host, priority: 0},
- {_id: 2, host: nodes[2].host, priority: 0}
- ]
- });
-
- var dbName = testName;
- var collName = "test";
-
- jsTestLog("Wait for Node 0 to become the primary.");
- replTest.waitForState(replTest.nodes[0], ReplSetTest.State.PRIMARY);
-
- var primary = replTest.getPrimary();
- var primaryTestDB = primary.getDB(dbName);
-
- jsTestLog("1: Insert one document on the primary (Node 0) and ensure it is replicated.");
- assert.writeOK(primaryTestDB[collName].insert({a: 1}, {writeConcern: {w: 3}}));
-
- jsTestLog("2: Stop Node 2.");
- replTest.stop(2);
-
- jsTestLog("3: Wait until Node 2 is down.");
- replTest.waitForState(replTest.nodes[2], ReplSetTest.State.DOWN);
-
- var firstOplogEntryNode1 = getFirstOplogEntry(replTest.nodes[1]);
-
- jsTestLog("4: Overflow the primary's oplog.");
- overflowOplog(primary, primaryTestDB, 2);
-
- // Make sure that Node 1's oplog didn't overflow.
- assert.eq(firstOplogEntryNode1,
- getFirstOplogEntry(replTest.nodes[1]),
- "Node 1's oplog overflowed unexpectedly.");
-
- jsTestLog("5: Stop Node 1 and restart Node 2.");
- replTest.stop(1);
- replTest.restart(2);
-
- jsTestLog("6: Wait for Node 2 to transition to RECOVERING (it should be too stale).");
- replTest.waitForState(replTest.nodes[2], ReplSetTest.State.RECOVERING);
-
- jsTestLog("7: Stop and restart Node 2.");
- replTest.stop(2);
- replTest.restart(2);
-
- jsTestLog(
- "8: Wait for Node 2 to transition to RECOVERING (its oplog should remain stale after restart)");
- replTest.waitForState(replTest.nodes[2], ReplSetTest.State.RECOVERING);
-
- jsTestLog("9: Restart Node 1, which should have the full oplog history.");
- replTest.restart(1);
-
- jsTestLog("10: Wait for Node 2 to leave RECOVERING and transition to SECONDARY.");
- replTest.waitForState(replTest.nodes[2], ReplSetTest.State.SECONDARY);
-
- replTest.stopSet();
-}());
\ No newline at end of file
diff --git a/jstests/replsets/toostale.js b/jstests/replsets/toostale.js
new file mode 100644
index 00000000000..4f1a1057757
--- /dev/null
+++ b/jstests/replsets/toostale.js
@@ -0,0 +1,126 @@
+// This tests that:
+// * stale members get into state 3 (recovering)
+// * they stay in state 3 after restarting
+// * they can recover and go into state 2 if someone less up-to-date becomes primary
+//
+// This test requires persistence in order for a restarted node with a stale oplog to stay in the
+// RECOVERING state. A restarted node with an ephemeral storage engine will not have an oplog upon
+// restart, so it will immediately resync.
+// @tags: [requires_persistence]
+
+/**
+ * 1: initial insert
+ * 2: initial sync
+ * 3: blind s2
+ * 4: overflow oplog
+ * 5: unblind s2
+ * 6: check s2.state == 3
+ * 7: restart s2
+ * 8: check s2.state == 3
+ */
+
+var w = 0;
+var wait = function(f) {
+ w++;
+ var n = 0;
+ while (!f()) {
+ if (n % 4 == 0)
+ print("toostale.js waiting " + w);
+ if (++n == 4) {
+ print("" + f);
+ }
+ assert(n < 200, 'tried 200 times, giving up');
+ sleep(1000);
+ }
+};
+
+var reconnect = function(a) {
+ wait(function() {
+ try {
+ a.bar.stats();
+ return true;
+ } catch (e) {
+ print(e);
+ return false;
+ }
+ });
+};
+
+var name = "toostale";
+var replTest = new ReplSetTest({name: name, nodes: 3, oplogSize: 5});
+var host = getHostName();
+
+var nodes = replTest.startSet();
+replTest.initiate({
+ _id: name,
+ members: [
+ {_id: 0, host: host + ":" + replTest.ports[0], priority: 2},
+ {_id: 1, host: host + ":" + replTest.ports[1], arbiterOnly: true},
+ {_id: 2, host: host + ":" + replTest.ports[2], priority: 0}
+ ]
+});
+var master = replTest.getPrimary();
+var mdb = master.getDB("foo");
+
+print("1: initial insert");
+mdb.foo.save({a: 1000});
+
+print("2: initial sync");
+replTest.awaitReplication();
+
+print("3: stop s2");
+replTest.stop(2);
+print("waiting until the master knows the slave is blind");
+assert.soon(function() {
+ return master.getDB("admin").runCommand({replSetGetStatus: 1}).members[2].health == 0;
+});
+print("okay");
+
+print("4: overflow oplog");
+reconnect(master.getDB("local"));
+var count = master.getDB("local").oplog.rs.find().itcount();
+var prevCount = -1;
+while (count > prevCount) {
+ print("inserting 1000");
+ var bulk = mdb.bar.initializeUnorderedBulkOp();
+ for (var i = 0; i < 1000; i++) {
+ bulk.insert({x: i, date: new Date(), str: "safkaldmfaksndfkjansfdjanfjkafa"});
+ }
+ assert.writeOK(bulk.execute());
+
+ prevCount = count;
+ replTest.awaitReplication();
+ count = master.getDB("local").oplog.rs.find().itcount();
+ print("count: " + count + " prev: " + prevCount);
+}
+
+print("5: restart s2");
+replTest.restart(2);
+print("waiting until the master knows the slave is not blind");
+assert.soon(function() {
+ return master.getDB("admin").runCommand({replSetGetStatus: 1}).members[2].health != 0;
+});
+print("okay");
+
+print("6: check s2.state == 3");
+var goStale = function() {
+ wait(function() {
+ var status = master.getDB("admin").runCommand({replSetGetStatus: 1});
+ printjson(status);
+ return status.members[2].state == 3;
+ });
+};
+goStale();
+
+print("7: restart s2");
+replTest.stop(2);
+replTest.restart(2);
+
+print("8: check s2.state == 3");
+assert.soon(function() {
+ var status = master.getDB("admin").runCommand({replSetGetStatus: 1});
+ printjson(status);
+ return status.members && status.members[2].state == 3;
+});
+
+replTest.stop(0);
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 41781d2fcc9..308e6c7ecce 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -315,20 +315,10 @@ void BackgroundSync::_produce(OperationContext* opCtx) {
return;
}
- // We only need to mark ourselves as too stale once.
- if (_tooStale) {
- return;
- }
-
- // Mark yourself as too stale.
- _tooStale = true;
-
error() << "too stale to catch up -- entering maintenance mode";
log() << "Our newest OpTime : " << lastOpTimeFetched;
log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
-
- // Activate maintenance mode and transition to RECOVERING.
auto status = _replCoord->setMaintenanceMode(true);
if (!status.isOK()) {
warning() << "Failed to transition into maintenance mode: " << status;
@@ -363,20 +353,6 @@ void BackgroundSync::_produce(OperationContext* opCtx) {
return;
}
- // If we find a good sync source after having gone too stale, disable maintenance mode so we can
- // transition to SECONDARY.
- if (_tooStale) {
-
- _tooStale = false;
-
- log() << "No longer too stale. Able to sync from " << _syncSourceHost;
-
- auto status = _replCoord->setMaintenanceMode(false);
- if (!status.isOK()) {
- warning() << "Failed to leave maintenance mode: " << status;
- }
- }
-
long long lastHashFetched;
{
stdx::lock_guard<stdx::mutex> lock(_mutex);
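The hunk above removes the code path in bgsync that called setMaintenanceMode(false) once a usable sync source was found again, so with this revert a too-stale node stays in RECOVERING until an operator intervenes. The shell-level counterpart of that removed call is the replSetMaintenance admin command; a hedged sketch is below, noting that for a member whose oplog is genuinely too stale the dochub page referenced in the log message above prescribes a full resync instead:

    // Run against the stuck secondary itself. This is the manual analogue of the removed
    // setMaintenanceMode(false) call; it does not help if the member's oplog is still too
    // stale, in which case the node must be resynced (e.g. restarted with an empty dbpath
    // so it performs an initial sync).
    assert.commandWorked(db.adminCommand({replSetMaintenance: false}));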
diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h
index 6d068967723..ead036c79d2 100644
--- a/src/mongo/db/repl/bgsync.h
+++ b/src/mongo/db/repl/bgsync.h
@@ -174,47 +174,31 @@ private:
// A pointer to the replication coordinator external state.
ReplicationCoordinatorExternalState* _replicationCoordinatorExternalState;
- /**
- * All member variables are labeled with one of the following codes indicating the
- * synchronization rules for accessing them:
- *
- * (PR) Completely private to BackgroundSync. Can be read or written to from within the main
- * BackgroundSync thread without synchronization. Shouldn't be accessed outside of this
- * thread.
- *
- * (S) Self-synchronizing; access in any way from any context.
- *
- * (M) Reads and writes guarded by _mutex
- *
- */
+ // _mutex protects all of the class variables declared below.
+ //
+ // Never hold bgsync mutex when trying to acquire the ReplicationCoordinator mutex.
+ mutable stdx::mutex _mutex;
- // Protects member data of BackgroundSync.
- // Never hold the BackgroundSync mutex when trying to acquire the ReplicationCoordinator mutex.
- mutable stdx::mutex _mutex; // (S)
+ OpTime _lastOpTimeFetched;
- OpTime _lastOpTimeFetched; // (M)
-
- // lastFetchedHash is used to match ops to determine if we need to rollback, when a secondary.
- long long _lastFetchedHash = 0LL; // (M)
+ // lastFetchedHash is used to match ops to determine if we need to rollback, when
+ // a secondary.
+ long long _lastFetchedHash = 0LL;
// Thread running producerThread().
- std::unique_ptr<stdx::thread> _producerThread; // (M)
+ std::unique_ptr<stdx::thread> _producerThread;
// Set to true if shutdown() has been called.
- bool _inShutdown = false; // (M)
-
- // Flag that marks whether a node's oplog has no common point with any
- // potential sync sources.
- bool _tooStale = false; // (PR)
+ bool _inShutdown = false;
- ProducerState _state = ProducerState::Starting; // (M)
+ ProducerState _state = ProducerState::Starting;
- HostAndPort _syncSourceHost; // (M)
+ HostAndPort _syncSourceHost;
// Current sync source resolver validating sync source candidates.
// Pointer may be read on any thread that locks _mutex or unlocked on the BGSync thread. It can
// only be written to by the BGSync thread while holding _mutex.
- std::unique_ptr<SyncSourceResolver> _syncSourceResolver; // (M)
+ std::unique_ptr<SyncSourceResolver> _syncSourceResolver;
// Current oplog fetcher tailing the oplog on the sync source.
std::unique_ptr<OplogFetcher> _oplogFetcher;