8 files changed, 203 insertions, 17 deletions
diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py
index 9e0fe34d4db..9d4878298f4 100644
--- a/buildscripts/resmokelib/testing/fixtures/replicaset.py
+++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py
@@ -196,6 +196,7 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
         """Wait for replica set tpo be ready."""
         self._await_primary()
         self._await_secondaries()
+        self._await_stable_checkpoint()
 
     def _await_primary(self):
         # Wait for the primary to be elected.
@@ -230,6 +231,63 @@ class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-inst
                 time.sleep(0.1)  # Wait a little bit before trying again.
             self.logger.info("Secondary on port %d is now available.", secondary.port)
 
+    def _await_stable_checkpoint(self):
+        """Wait for every node to report a stable checkpoint."""
+        # Since this method is called at startup we expect the first node to be primary even when
+        # self.all_nodes_electable is True.
+        primary_client = self.nodes[0].mongo_client()
+        if self.auth_options is not None:
+            auth_db = primary_client[self.auth_options["authenticationDatabase"]]
+            auth_db.authenticate(self.auth_options["username"],
+                                 password=self.auth_options["password"],
+                                 mechanism=self.auth_options["authenticationMechanism"])
+        # Algorithm precondition: All nodes must be in primary/secondary state.
+        #
+        # 1) Perform a majority write. This will guarantee the primary updates its commit point
+        #    to the value of this write.
+        #
+        # 2) Perform a second write. This will guarantee that all nodes will update their commit
+        #    point to a time that is >= the previous write. That will trigger a stable checkpoint
+        #    on all nodes.
+        # TODO(SERVER-33248): Remove this block. We should not need to prod the replica set to
+        # advance the commit point if the commit point being lagged is sufficient to choose a
+        # sync source.
+        admin = primary_client["admin"]
+        # Database.command() never applies a database-level write concern, so pass it explicitly.
+        admin.command("appendOplogNote", data={"await_stable_checkpoint": 1},
+                      writeConcern={"w": "majority"})
+        admin.command("appendOplogNote", data={"await_stable_checkpoint": 2})
+
+        for node in self.nodes:
+            self.logger.info("Waiting for node on port %d to have a stable checkpoint.", node.port)
+            client = node.mongo_client(read_preference=pymongo.ReadPreference.SECONDARY)
+            if self.auth_options is not None:
+                client_auth_db = client[self.auth_options["authenticationDatabase"]]
+                client_auth_db.authenticate(self.auth_options["username"],
+                                            password=self.auth_options["password"],
+                                            mechanism=self.auth_options["authenticationMechanism"])
+
+            while True:
+                status = client["admin"].command("replSetGetStatus")
+                # The `lastStableCheckpointTimestamp` field contains the timestamp of a previous
+                # checkpoint taken at a stable timestamp. At startup recovery, this field
+                # contains the timestamp reflected in the data. After startup recovery, it may
+                # be lagged and there may be a stable checkpoint at a newer timestamp.
+                last_stable = status.get("lastStableCheckpointTimestamp")
+
+                # A missing `lastStableCheckpointTimestamp` field indicates that the storage
+                # engine does not support "recover to a stable timestamp".
+                if last_stable is None:
+                    break
+
+                # A null `lastStableCheckpointTimestamp` indicates that the storage engine supports
+                # "recover to a stable timestamp" but does not have a stable checkpoint yet.
+                if last_stable.time:
+                    self.logger.info("Node on port %d now has a stable checkpoint. Time: %s",
+                                     node.port, last_stable)
+                    break
+                time.sleep(0.1)  # Wait a little bit before trying again.
+
     def _do_teardown(self):
         self.logger.info("Stopping all members of the replica set...")
 
diff --git a/jstests/noPassthrough/auth_reject_mismatching_logical_times.js b/jstests/noPassthrough/auth_reject_mismatching_logical_times.js
index 804251c63a2..0d2a368ad49 100644
--- a/jstests/noPassthrough/auth_reject_mismatching_logical_times.js
+++ b/jstests/noPassthrough/auth_reject_mismatching_logical_times.js
@@ -45,7 +45,9 @@
     // Add shard with auth enabled.
     const rst = new ReplSetTest({nodes: 2});
     rst.startSet({keyFile: "jstests/libs/key1", shardsvr: ""});
-    rst.initiate();
+
+    // TODO: Wait for stable checkpoint when SERVER-32672 is fixed.
+    rst.initiateWithAnyNodeAsPrimary(null, "replSetInitiate", {doNotWaitForStableCheckpoint: true});
     assert.commandWorked(st.s.adminCommand({addShard: rst.getURL()}));
 
     const testDB = st.s.getDB("test");
diff --git a/jstests/noPassthrough/auto_retry_on_network_error.js b/jstests/noPassthrough/auto_retry_on_network_error.js
index 64c5ec6ae1f..b9bc5d6685b 100644
--- a/jstests/noPassthrough/auto_retry_on_network_error.js
+++ b/jstests/noPassthrough/auto_retry_on_network_error.js
@@ -43,7 +43,9 @@
 
     const rst = new ReplSetTest({nodes: 1});
     rst.startSet();
-    rst.initiate();
+
+    // awaitLastStableCheckpointTimestamp runs an 'appendOplogNote' command which is not retryable.
+    rst.initiateWithAnyNodeAsPrimary(null, "replSetInitiate", {doNotWaitForStableCheckpoint: true});
 
     const dbName = "test";
     const collName = "auto_retry";
diff --git a/jstests/noPassthrough/read_majority.js b/jstests/noPassthrough/read_majority.js
index 2cdf629927a..d4fbb75c367 100644
--- a/jstests/noPassthrough/read_majority.js
+++ b/jstests/noPassthrough/read_majority.js
@@ -40,7 +40,9 @@ load("jstests/libs/analyze_plan.js");
             }
         });
         replTest.startSet();
-        replTest.initiate();
+        // Cannot wait for a stable checkpoint with 'testingSnapshotBehaviorInIsolation' set.
+        replTest.initiateWithAnyNodeAsPrimary(
+            null, "replSetInitiate", {doNotWaitForStableCheckpoint: true});
 
         const session =
             replTest.getPrimary().getDB("test").getMongo().startSession({causalConsistency: false});
diff --git a/jstests/noPassthrough/read_majority_reads.js b/jstests/noPassthrough/read_majority_reads.js
index c8322a31c67..1f196856dd7 100644
--- a/jstests/noPassthrough/read_majority_reads.js
+++ b/jstests/noPassthrough/read_majority_reads.js
@@ -234,7 +234,9 @@
         }
     });
     replTest.startSet();
-    replTest.initiate();
+    // Cannot wait for a stable checkpoint with 'testingSnapshotBehaviorInIsolation' set.
+    replTest.initiateWithAnyNodeAsPrimary(
+        null, "replSetInitiate", {doNotWaitForStableCheckpoint: true});
 
     var mongod = replTest.getPrimary();
 
diff --git a/jstests/noPassthrough/timestamp_index_builds.js b/jstests/noPassthrough/timestamp_index_builds.js
index 3ebda4f3691..b55b1805e00 100644
--- a/jstests/noPassthrough/timestamp_index_builds.js
+++ b/jstests/noPassthrough/timestamp_index_builds.js
@@ -41,12 +41,7 @@
 
     let coll = getColl(rst.getPrimary());
 
-    // TODO Can be removed with SERVER-33165.
-    //
-    // Create a collection and perform two majority writes. This guarantees both nodes will have a
-    // stable timestamp.
-    assert.commandWorked(
-        coll.insert({}, {writeConcern: {w: "majority", wtimeout: rst.kDefaultTimeoutMS}}));
+    // Create a collection and wait for the stable timestamp to exceed its creation on both nodes.
     assert.commandWorked(
         coll.insert({}, {writeConcern: {w: "majority", wtimeout: rst.kDefaultTimeoutMS}}));
 
diff --git a/jstests/noPassthrough/wt_delayed_secondary_read_concern_majority.js b/jstests/noPassthrough/wt_delayed_secondary_read_concern_majority.js
index f8679f67d7f..83fe5283117 100644
--- a/jstests/noPassthrough/wt_delayed_secondary_read_concern_majority.js
+++ b/jstests/noPassthrough/wt_delayed_secondary_read_concern_majority.js
@@ -45,7 +45,9 @@
         conf.members[1].slaveDelay = 24 * 60 * 60;
 
         rst.startSet();
-        rst.initiateWithAnyNodeAsPrimary(conf);
+        // We cannot wait for a stable checkpoint due to the slaveDelay.
+        rst.initiateWithAnyNodeAsPrimary(
+            conf, "replSetInitiate", {doNotWaitForStableCheckpoint: true});
         var master = rst.getPrimary();  // Waits for PRIMARY state.
 
         // Reconfigure primary with a small cache size so less data needs to be
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index 8f46b66c97d..7c200283189 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -142,9 +142,9 @@ var ReplSetTest = function(opts) {
         return self.liveNodes.master || false;
     }
 
-    function asCluster(conn, fn) {
-        if (self.keyFile) {
-            return authutil.asCluster(conn, self.keyFile, fn);
+    function asCluster(conn, fn, keyFileParam = self.keyFile) {
+        if (keyFileParam) {
+            return authutil.asCluster(conn, keyFileParam, fn);
         } else {
             return fn();
         }
@@ -649,7 +649,8 @@ var ReplSetTest = function(opts) {
             var primary = expectedPrimaryNodeId;
 
             for (var i = 0; i < nodes.length; i++) {
-                var replSetGetStatus = nodes[i].getDB("admin").runCommand({replSetGetStatus: 1});
+                var replSetGetStatus =
+                    assert.commandWorked(nodes[i].getDB("admin").runCommand({replSetGetStatus: 1}));
                 var nodesPrimary = -1;
                 for (var j = 0; j < replSetGetStatus.members.length; j++) {
                     if (replSetGetStatus.members[j].state === ReplSetTest.State.PRIMARY) {
@@ -841,7 +842,8 @@ var ReplSetTest = function(opts) {
      * aren't authorized to run replSetGetStatus.
      * TODO(SERVER-14017): remove this in favor of using initiate() everywhere.
      */
-    this.initiateWithAnyNodeAsPrimary = function(cfg, initCmd) {
+    this.initiateWithAnyNodeAsPrimary = function(
+        cfg, initCmd, {doNotWaitForStableCheckpoint: doNotWaitForStableCheckpoint = false} = {}) {
         var master = this.nodes[0].getDB("admin");
         var config = cfg || this.getReplSetConfig();
         var cmd = {};
@@ -926,7 +928,6 @@ var ReplSetTest = function(opts) {
             master = this.getPrimary();
             jsTest.authenticateNodes(this.nodes);
         }
-
         this.awaitSecondaryNodes();
 
         let shouldWaitForKeys = true;
@@ -990,6 +991,9 @@ var ReplSetTest = function(opts) {
             });
         }
 
+        if (!doNotWaitForStableCheckpoint) {
+            self.awaitLastStableCheckpointTimestamp();
+        }
     };
 
     /**
@@ -1122,6 +1126,125 @@ var ReplSetTest = function(opts) {
         return masterOpTime;
     };
 
+    /**
+     * Waits for all nodes in this replica set to take a stable checkpoint. A node needs a stable
+     * timestamp to be able to roll back, and a stable checkpoint to restart without going into
+     * resync right after initial sync; waiting for every node to report a stable checkpoint
+     * ensures both hold so tests can run as expected. Beyond simply waiting, this function does
+     * writes to ensure a stable checkpoint will be taken. Returns early on config servers; skips
+     * arbiters and nodes that do not report `lastStableCheckpointTimestamp`.
+     */
+    this.awaitLastStableCheckpointTimestamp = function() {
+        let rst = this;
+        let master = rst.getPrimary();
+        let id = tojson(rst.nodeList());
+
+        // Algorithm precondition: All nodes must be in primary/secondary state.
+        //
+        // 1) Perform a majority write. This will guarantee the primary updates its commit point
+        //    to the value of this write.
+        //
+        // 2) Perform a second write. This will guarantee that all nodes will update their commit
+        //    point to a time that is >= the previous write. That will trigger a stable checkpoint
+        //    on all nodes.
+        // TODO(SERVER-33248): Remove this block. We should not need to prod the replica set to
+        // advance the commit point if the commit point being lagged is sufficient to choose a
+        // sync source.
+        function advanceCommitPoint(master) {
+            // Shadow 'db' so that 'appendOplogNoteFn' runs its commands against the primary node.
+            let db = master.getDB('admin');
+            const appendOplogNoteFn = function() {
+                assert.commandWorked(db.adminCommand({
+                    "appendOplogNote": 1,
+                    "data": {"awaitLastStableCheckpointTimestamp": 1},
+                    "writeConcern": {"w": "majority", "wtimeout": ReplSetTest.kDefaultTimeoutMS}
+                }));
+                assert.commandWorked(db.adminCommand(
+                    {"appendOplogNote": 1, "data": {"awaitLastStableCheckpointTimestamp": 2}}));
+            };
+
+            // TODO(SERVER-14017): Remove this extra sub-shell in favor of a cleaner authentication
+            // solution.
+            const masterId = "n" + rst.getNodeId(master);
+            const masterOptions = rst.nodeOptions[masterId] || {};
+            if (masterOptions.clusterAuthMode === "x509") {
+                print("AwaitLastStableCheckpointTimestamp: authenticating on separate shell " +
+                      "with x509 for " + id);
+                const subShellArgs = [
+                    'mongo',
+                    '--ssl',
+                    '--sslCAFile=' + masterOptions.sslCAFile,
+                    '--sslPEMKeyFile=' + masterOptions.sslPEMKeyFile,
+                    '--sslAllowInvalidHostnames',
+                    '--authenticationDatabase=$external',
+                    '--authenticationMechanism=MONGODB-X509',
+                    master.host,
+                    '--eval',
+                    `(${appendOplogNoteFn.toString()})();`
+                ];
+
+                const retVal = _runMongoProgram(...subShellArgs);
+                assert.eq(retVal, 0, 'mongo shell did not succeed with exit code 0');
+            } else {
+                if (masterOptions.clusterAuthMode) {
+                    print("AwaitLastStableCheckpointTimestamp: authenticating with " +
+                          masterOptions.clusterAuthMode + " for " + id);
+                }
+                asCluster(master, appendOplogNoteFn, masterOptions.keyFile);
+            }
+        }
+
+        print("AwaitLastStableCheckpointTimestamp: Beginning for " + id);
+
+        let replSetStatus = assert.commandWorked(master.adminCommand("replSetGetStatus"));
+        if (replSetStatus["configsvr"]) {
+            // Performing dummy replicated writes against a configsvr is hard, especially if auth
+            // is also enabled.
+            return;
+        }
+
+        rst.awaitNodesAgreeOnPrimary();
+        master = rst.getPrimary();
+
+        print("AwaitLastStableCheckpointTimestamp: ensuring the commit point advances for " + id);
+        advanceCommitPoint(master);
+
+        print("AwaitLastStableCheckpointTimestamp: Waiting for stable checkpoints for " + id);
+
+        assert.soonNoExcept(function() {
+            for (let node of rst.nodes) {
+                // The `lastStableCheckpointTimestamp` field contains the timestamp of a previous
+                // checkpoint taken at a stable timestamp. At startup recovery, this field
+                // contains the timestamp reflected in the data. After startup recovery, it may
+                // be lagged and there may be a stable checkpoint at a newer timestamp.
+                let res = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
+
+                // Continue if we're connected to an arbiter.
+                if (res.myState === ReplSetTest.State.ARBITER) {
+                    continue;
+                }
+
+                // A missing `lastStableCheckpointTimestamp` field indicates that the storage
+                // engine does not support `recover to a stable timestamp`.
+                if (!res.hasOwnProperty("lastStableCheckpointTimestamp")) {
+                    continue;
+                }
+
+                // A null (zero) `lastStableCheckpointTimestamp` indicates that the storage engine
+                // supports "recover to a stable timestamp" but has no stable checkpoint yet.
+                if (res.lastStableCheckpointTimestamp.getTime() === 0) {
+                    print("AwaitLastStableCheckpointTimestamp: " + node.host +
+                          " does not have a stable checkpoint yet.");
+                    return false;
+                }
+            }
+
+            return true;
+        }, "Not all members have a stable checkpoint");
+
+        print("AwaitLastStableCheckpointTimestamp: Successfully took stable checkpoints on " + id);
+    };
+
     // Wait until the optime of the specified type reaches the primary's last applied optime.
     this.awaitReplication = function(timeout, secondaryOpTimeType) {
         timeout = timeout || self.kDefaultTimeoutMS;