From 320e2d28396c250dfd69640dcf865dff50ea0b55 Mon Sep 17 00:00:00 2001
From: Jason Chan
Date: Wed, 28 Apr 2021 19:53:03 +0000
Subject: SERVER-55305 Add new step to replication rollback to restore the txns
 table to be consistent with the stableTimestamp

---
 ...d_txn_table_updates_during_oplog_application.js | 183 +++++++++++++++++++++
 ...sced_txn_table_updates_from_vectored_inserts.js | 115 +++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
 create mode 100644 jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js

(limited to 'jstests')

diff --git a/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js b/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
new file mode 100644
index 00000000000..39ceaa77713
--- /dev/null
+++ b/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
@@ -0,0 +1,183 @@
+/**
+ * Tests that the rollback procedure will update the 'config.transactions' table to be consistent
+ * with the node data at the 'stableTimestamp', specifically in the case where multiple derived ops
+ * to the 'config.transactions' table were coalesced into a single operation during secondary oplog
+ * application.
+ * We also test that if a node crashes after oplog truncation during rollback, the update made to
+ * the 'config.transactions' table is persisted on startup.
+ *
+ * @tags: [requires_persistence]
+ */
+
+(function() {
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/write_concern_util.js");
+
+const oplogApplierBatchSize = 100;
+
+function runTest(crashAfterRollbackTruncation) {
+    const rst = new ReplSetTest({
+        nodes: {
+            n0: {},
+            // Set the 'syncdelay' to 1s to speed up checkpointing. Also explicitly set the batch
+            // size for oplog application to ensure the number of retryable write statements being
+            // made majority committed isn't a multiple of it.
+            n1: {syncdelay: 1, setParameter: {replBatchLimitOperations: oplogApplierBatchSize}},
+            // Set the bgSyncOplogFetcherBatchSize to 1 oplog entry to guarantee replication
+            // progress with the stopReplProducerOnDocument failpoint.
+            n2: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+            n3: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+            n4: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+        },
+        // Force secondaries to sync from the primary to guarantee replication progress with the
+        // stopReplProducerOnDocument failpoint. Also disable primary catchup because some
+        // replicated retryable write statements are intentionally not being made majority
+        // committed.
+        settings: {chainingAllowed: false, catchUpTimeoutMillis: 0},
+    });
+    rst.startSet();
+    rst.initiate();
+
+    const primary = rst.getPrimary();
+    const ns = "test.retryable_write_partial_rollback";
+    assert.commandWorked(
+        primary.getCollection(ns).insert({_id: 0, counter: 0}, {writeConcern: {w: 5}}));
+
+    const [secondary1, secondary2, secondary3, secondary4] = rst.getSecondaries();
+
+    // Disable replication on all of the secondaries to manually control the replication progress.
+    const stopReplProducerFailpoints = [secondary1, secondary2, secondary3, secondary4].map(
+        conn => configureFailPoint(conn, 'stopReplProducer'));
+
+    // While replication is still entirely disabled, additionally disable replication partway into
+    // the retryable write on all but the first secondary.
+    // The idea is that while secondary1 will apply all of the oplog entries in a single batch,
+    // the other secondaries will only apply up to counterMajorityCommitted oplog entries.
+    const counterTotal = oplogApplierBatchSize;
+    const counterMajorityCommitted = counterTotal - 2;
+    const stopReplProducerOnDocumentFailpoints = [secondary2, secondary3, secondary4].map(
+        conn => configureFailPoint(conn,
+                                   'stopReplProducerOnDocument',
+                                   {document: {"diff.u.counter": counterMajorityCommitted + 1}}));
+
+    const lsid = ({id: UUID()});
+
+    assert.commandWorked(primary.getCollection(ns).runCommand("update", {
+        updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+        lsid,
+        txnNumber: NumberLong(1),
+    }));
+
+    const stmtMajorityCommitted = primary.getCollection("local.oplog.rs")
+                                      .findOne({ns, "o.diff.u.counter": counterMajorityCommitted});
+    assert.neq(null, stmtMajorityCommitted);
+
+    for (const fp of stopReplProducerFailpoints) {
+        fp.off();
+
+        // Wait for the secondary to have applied through the counterMajorityCommitted retryable
+        // write statement. We do this for each secondary individually, starting with secondary1,
+        // to guarantee that secondary1 will advance its stable_timestamp when learning of the
+        // other secondaries also having applied through counterMajorityCommitted.
+        assert.soon(() => {
+            const {optimes: {appliedOpTime, durableOpTime}} =
+                assert.commandWorked(fp.conn.adminCommand({replSetGetStatus: 1}));
+
+            print(`${fp.conn.host}: ${tojsononeline({
+                appliedOpTime,
+                durableOpTime,
+                stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+            })}`);
+
+            return bsonWoCompare(appliedOpTime.ts, stmtMajorityCommitted.ts) >= 0 &&
+                bsonWoCompare(durableOpTime.ts, stmtMajorityCommitted.ts) >= 0;
+        });
+    }
+
+    // Wait for secondary1 to have advanced its stable_timestamp.
+    assert.soon(() => {
+        const {lastStableRecoveryTimestamp} =
+            assert.commandWorked(secondary1.adminCommand({replSetGetStatus: 1}));
+
+        print(`${secondary1.host}: ${tojsononeline({
+            lastStableRecoveryTimestamp,
+            stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+        })}`);
+
+        return bsonWoCompare(lastStableRecoveryTimestamp, stmtMajorityCommitted.ts) >= 0;
+    });
+
+    // Step up one of the other secondaries and do a write which becomes majority committed to
+    // force secondary1 to go into rollback.
+    rst.freeze(secondary1);
+    assert.commandWorked(secondary2.adminCommand({replSetStepUp: 1}));
+    rst.freeze(primary);
+    rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary2);
+
+    let hangAfterTruncate;
+    if (crashAfterRollbackTruncation) {
+        hangAfterTruncate = configureFailPoint(secondary1, 'hangAfterOplogTruncationInRollback');
+    }
+
+    for (const fp of stopReplProducerOnDocumentFailpoints) {
+        fp.off();
+    }
+
+    // Wait for secondary2 to be a writable primary.
+    rst.getPrimary();
+
+    // Do a write which becomes majority committed and wait for secondary1 to complete its
+    // rollback.
+    assert.commandWorked(
+        secondary2.getCollection("test.dummy").insert({}, {writeConcern: {w: 'majority'}}));
+
+    if (crashAfterRollbackTruncation) {
+        // Entering rollback will close connections so we expect some network errors when waiting
+        // on the failpoint.
+        assert.soonNoExcept(() => {
+            hangAfterTruncate.wait();
+            return true;
+        }, `failed to wait for fail point ${hangAfterTruncate.failPointName}`);
+
+        // Crash the node after it performs oplog truncation.
+        rst.stop(secondary1, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
+        rst.restart(secondary1, {
+            "noReplSet": false,
+            setParameter: 'failpoint.stopReplProducer=' + tojson({mode: 'alwaysOn'})
+        });
+        rst.waitForState(secondary1, ReplSetTest.State.SECONDARY);
+        secondary1.setSecondaryOk();
+        // On startup, we expect to see the update persisted in the 'config.transactions' table.
+        let restoredDoc =
+            secondary1.getCollection('config.transactions').findOne({"_id.id": lsid.id});
+        assert.neq(null, restoredDoc);
+        secondary1.adminCommand({configureFailPoint: "stopReplProducer", mode: "off"});
+    }
+
+    // Reconnect to secondary1 after it completes its rollback and step it up to be the new
+    // primary.
+    rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary2);
+    assert.commandWorked(secondary1.adminCommand({replSetFreeze: 0}));
+    rst.stepUp(secondary1, {awaitWritablePrimary: false});
+
+    const docBeforeRetry = secondary1.getCollection(ns).findOne({_id: 0});
+    assert.eq(docBeforeRetry, {_id: 0, counter: counterMajorityCommitted});
+
+    assert.commandWorked(secondary1.getCollection(ns).runCommand("update", {
+        updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+        lsid,
+        txnNumber: NumberLong(1),
+        writeConcern: {w: 5},
+    }));
+
+    const docAfterRetry = secondary1.getCollection(ns).findOne({_id: 0});
+    assert.eq(docAfterRetry, {_id: 0, counter: counterTotal});
+
+    rst.stopSet();
+}
+
+// Test the general scenario where we perform the appropriate update to the 'config.transactions'
+// table during rollback.
+runTest(false);
+// Extends the test to crash the secondary in the middle of rollback right after oplog truncation.
+// We assert that the update made to the 'config.transactions' table persisted on startup.
+runTest(true);
+})();
\ No newline at end of file
diff --git a/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js b/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js
new file mode 100644
index 00000000000..e6ca0056c31
--- /dev/null
+++ b/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js
@@ -0,0 +1,115 @@
+/**
+ * Tests that the rollback procedure will update the 'config.transactions' table to be consistent
+ * with the node data at the 'stableTimestamp', specifically in the case where multiple derived ops
+ * to the 'config.transactions' table were coalesced into a single operation when performing
+ * vectored inserts on the primary.
+ */
+
+(function() {
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/write_concern_util.js");
+
+const rst = new ReplSetTest({
+    nodes: {
+        // Set the syncdelay to 1s to speed up checkpointing.
+        n0: {syncdelay: 1},
+        // Set the bgSyncOplogFetcherBatchSize to 1 oplog entry to guarantee replication progress
+        // with the stopReplProducerOnDocument failpoint.
+        n1: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+        n2: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+    },
+    // Force secondaries to sync from the primary to guarantee replication progress with the
+    // stopReplProducerOnDocument failpoint. Also disable primary catchup because some replicated
+    // retryable write statements are intentionally not being made majority committed.
+    settings: {chainingAllowed: false, catchUpTimeoutMillis: 0},
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const ns = "test.retryable_write_coalesced_txn_updates";
+assert.commandWorked(primary.getCollection(ns).insert({_id: -1}, {writeConcern: {w: 3}}));
+
+const [secondary1, secondary2] = rst.getSecondaries();
+
+// Disable replication partway into the retryable write on all of the secondaries. The idea is that
+// while the primary will apply all of the writes in a single storage transaction, the secondaries
+// will only apply up to insertBatchMajorityCommitted oplog entries.
+const insertBatchTotal = 20;
+const insertBatchMajorityCommitted = insertBatchTotal - 2;
+const stopReplProducerOnDocumentFailpoints = [secondary1, secondary2].map(
+    conn => configureFailPoint(
+        conn, 'stopReplProducerOnDocument', {document: {"_id": insertBatchMajorityCommitted + 1}}));
+
+const lsid = ({id: UUID()});
+
+assert.commandWorked(primary.getCollection(ns).runCommand("insert", {
+    documents: Array.from({length: insertBatchTotal}, (_, i) => ({_id: i})),
+    lsid,
+    txnNumber: NumberLong(1),
+}));
+
+const stmtMajorityCommitted =
+    primary.getCollection("local.oplog.rs").findOne({ns, "o._id": insertBatchMajorityCommitted});
+assert.neq(null, stmtMajorityCommitted);
+
+// Wait for the primary to have advanced its stable_timestamp.
+assert.soon(() => {
+    const {lastStableRecoveryTimestamp} =
+        assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+
+    const wtStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1})).wiredTiger;
+    const latestMajority =
+        wtStatus["snapshot-window-settings"]["latest majority snapshot timestamp available"];
+
+    print(`${primary.host}: ${tojsononeline({
+        lastStableRecoveryTimestamp,
+        stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts,
+        "latest majority snapshot timestamp available": latestMajority
+    })}`);
+
+    // Make sure 'secondary1' has a 'lastApplied' optime at least as recent as
+    // 'stmtMajorityCommitted.ts'. Otherwise, it can fail to win the election later.
+    const {optimes: {appliedOpTime}} =
+        assert.commandWorked(secondary1.adminCommand({replSetGetStatus: 1}));
+    print(`${secondary1.host}: ${tojsononeline({appliedOpTime})}`);
+
+    return bsonWoCompare(lastStableRecoveryTimestamp, stmtMajorityCommitted.ts) >= 0 &&
+        bsonWoCompare(appliedOpTime.ts, stmtMajorityCommitted.ts) >= 0;
+});
+
+// Step up one of the secondaries and do a write which becomes majority committed to force the
+// current primary to go into rollback.
+assert.commandWorked(secondary1.adminCommand({replSetStepUp: 1}));
+rst.freeze(primary);
+rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary1);
+
+for (const fp of stopReplProducerOnDocumentFailpoints) {
+    fp.off();
+}
+
+rst.getPrimary();  // Wait for secondary1 to be a writable primary.
+
+// Do a write which becomes majority committed and wait for the old primary to have completed its
+// rollback.
+assert.commandWorked(secondary1.getCollection("test.dummy").insert({}, {writeConcern: {w: 3}}));
+
+// Reconnect to the primary after it completes its rollback and step it up to be the primary again.
+rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary1);
+assert.commandWorked(primary.adminCommand({replSetFreeze: 0}));
+rst.stepUp(primary);
+
+print(`${primary.host} session txn record: ${
+    tojson(primary.getCollection("config.transactions").findOne({"_id.id": lsid.id}))}`);
+
+// Make sure we don't re-execute operations that have already been applied by checking that the
+// retried insert doesn't fail with a 'DuplicateKey' error.
+assert.commandWorked(primary.getCollection(ns).runCommand("insert", {
+    documents: Array.from({length: insertBatchTotal}, (_, i) => ({_id: i})),
+    lsid,
+    txnNumber: NumberLong(1),
+    writeConcern: {w: 3},
+}));
+
+rst.stopSet();
+})();
\ No newline at end of file
--
cgit v1.2.1