author     Wenbin Zhu <wenbin.zhu@mongodb.com>               2021-06-08 21:00:45 +0000
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-06-08 21:48:33 +0000
commit     18f91c4304086b0334d90c1d94a7d3c7225439bf (patch)
tree       b5d3f20e1b60791bb4be91008d3bc8e464053477 /jstests
parent     2c1191cc8ff9ead1c600a85f7a1adbac8e09b1e1 (diff)
download   mongo-18f91c4304086b0334d90c1d94a7d3c7225439bf.tar.gz
SERVER-56631 Make sure retryable write pre-fetch phase can see the config.transactions record when committed snapshot is not at batch boundary.
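For context, a minimal sketch of the kind of majority read the recipient's retryable write pre-fetch phase issues against the donor sync source (hypothetical illustration only; the host string, filter, and variable names below are placeholders, not code from this patch). If the sync source's majority committed snapshot sits in the middle of an applied oplog batch, the coalesced 'config.transactions' update for that batch is not yet visible at that snapshot, so a read like this can come back without the expected record:

// Hypothetical illustration; identifiers are placeholders, not code from this patch.
const donorSyncSourceHost = "donorSecondary.example.net:27017";
const donorConn = new Mongo(donorSyncSourceHost);
const res = assert.commandWorked(donorConn.getDB("config").runCommand({
    find: "transactions",
    filter: {lastWriteOpTime: {$exists: true}},
    readConcern: {level: "majority"},
}));
// If the committed snapshot is mid-batch, res.cursor.firstBatch can be missing the entry for a
// retryable write whose statements have all been applied on the sync source.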
Diffstat (limited to 'jstests')
-rw-r--r--  jstests/replsets/tenant_migration_recipient_fetches_retryable_writes_entry_after_committed_snapshot.js  244
-rw-r--r--  jstests/replsets/tenant_migration_retryable_write_retry.js  22
2 files changed, 255 insertions, 11 deletions
diff --git a/jstests/replsets/tenant_migration_recipient_fetches_retryable_writes_entry_after_committed_snapshot.js b/jstests/replsets/tenant_migration_recipient_fetches_retryable_writes_entry_after_committed_snapshot.js
new file mode 100644
index 00000000000..fdeba25b524
--- /dev/null
+++ b/jstests/replsets/tenant_migration_recipient_fetches_retryable_writes_entry_after_committed_snapshot.js
@@ -0,0 +1,244 @@
+/**
+ * Tests that the tenant migration recipient correctly reads 'config.transactions' entries from a
+ * donor secondary. During secondary oplog application, updates to the same 'config.transactions'
+ * entry are coalesced into a single update for the most recent retryable write statement. If the
+ * majority committed snapshot of a secondary exists in the middle of a completed batch, then a
+ * recipient's majority read on 'config.transactions' can miss committed retryable writes at that
+ * majority commit point.
+ *
+ * @tags: [
+ * requires_fcv_50,
+ * requires_majority_read_concern,
+ * incompatible_with_eft,
+ * incompatible_with_windows_tls,
+ * incompatible_with_macos, requires_persistence
+ * ]
+ */
+
+(function() {
+load("jstests/replsets/libs/tenant_migration_test.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
+load("jstests/libs/fail_point_util.js"); // For configureFailPoint().
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject().
+load("jstests/libs/write_concern_util.js");
+
+const getRecipientCurrOp = function(conn, migrationId) {
+ const res = conn.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+ assert.eq(res.inprog.length, 1);
+ const currOp = res.inprog[0];
+ assert.eq(bsonWoCompare(currOp.instanceID, migrationId), 0);
+
+ return currOp;
+};
+
+const getDonorSyncSource = function(conn, migrationId) {
+ const currOp = getRecipientCurrOp(conn, migrationId);
+ return currOp.donorSyncSource;
+};
+
+const getStartFetchingDonorOpTime = function(conn, migrationId) {
+ const currOp = getRecipientCurrOp(conn, migrationId);
+ return currOp.startFetchingDonorOpTime;
+};
+
+const oplogApplierBatchSize = 50;
+
+const donorRst = new ReplSetTest({
+ nodes: {
+ n0: {},
+ // Set the 'syncdelay' to 1s to speed up checkpointing. Also explicitly set the batch
+ // size for oplog application to ensure the number of retryable write statements being
+ // made majority committed isn't a multiple of it.
+ n1: {syncdelay: 1, setParameter: {replBatchLimitOperations: oplogApplierBatchSize}},
+ // Set the bgSyncOplogFetcherBatchSize to 1 oplog entry to guarantee replication
+ // progress with the stopReplProducerOnDocument failpoint.
+ n2: {rsConfig: {priority: 0, hidden: true}, setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+ n3: {rsConfig: {priority: 0, hidden: true}, setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+ n4: {rsConfig: {priority: 0, hidden: true}, setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+ },
+ // Force secondaries to sync from the primary to guarantee replication progress with the
+ // stopReplProducerOnDocument failpoint. Also disable primary catchup because some replicated
+ // retryable write statements are intentionally not being made majority committed.
+ settings: {chainingAllowed: false, catchUpTimeoutMillis: 0},
+ nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().donor, {
+ setParameter: {
+ tenantMigrationExcludeDonorHostTimeoutMS: 30 * 1000,
+ // Allow non-timestamped reads on donor after migration completes for testing.
+ 'failpoint.tenantMigrationDonorAllowsNonTimestampedReads': tojson({mode: 'alwaysOn'}),
+ }
+ }),
+});
+donorRst.startSet();
+donorRst.initiateWithHighElectionTimeout();
+const donorPrimary = donorRst.getPrimary();
+
+if (!TenantMigrationUtil.isFeatureFlagEnabled(donorPrimary)) {
+ jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
+ donorRst.stopSet();
+ return;
+}
+
+const tenantMigrationTest = new TenantMigrationTest({name: jsTestName(), donorRst: donorRst});
+
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const kTenantId = "testTenantId";
+const migrationId = UUID();
+const kDbName = tenantMigrationTest.tenantDB(kTenantId, "testDB");
+const kCollName = "retryable_write_secondary_oplog_application";
+const kNs = `${kDbName}.${kCollName}`;
+
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(migrationId),
+ tenantId: kTenantId,
+ // The recipient needs to choose a donor secondary as its sync source.
+ readPreference: {mode: 'secondary'},
+};
+
+const fpAfterConnectingTenantMigrationRecipientInstance = configureFailPoint(
+ recipientPrimary, "fpAfterConnectingTenantMigrationRecipientInstance", {action: "hang"});
+
+const fpBeforeWaitingForRetryableWritePreFetchMajorityCommitted = configureFailPoint(
+ recipientPrimary, "fpBeforeWaitingForRetryableWritePreFetchMajorityCommitted");
+
+// Start tenant migration and hang after recipient connects to donor sync source.
+jsTestLog(`Starting tenant migration: ${tojson(migrationOpts)}`);
+assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+fpAfterConnectingTenantMigrationRecipientInstance.wait();
+
+// The recipient should connect to secondary1 since the other secondaries are hidden.
+const [secondary1, secondary2, secondary3, secondary4] = donorRst.getSecondaries();
+const syncSourceSecondaryHost = getDonorSyncSource(recipientPrimary, migrationId);
+assert.eq(syncSourceSecondaryHost, secondary1.host);
+
+assert.commandWorked(
+ donorPrimary.getCollection(kNs).insert({_id: 0, counter: 0}, {writeConcern: {w: 5}}));
+
+// The default WC is majority and the donor replica set can't satisfy majority writes after we
+// stop replication on the secondaries.
+assert.commandWorked(donorPrimary.adminCommand(
+ {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));
+donorRst.awaitReplication();
+
+// Disable replication on all of the secondaries to manually control the replication progress.
+const stopReplProducerFailpoints = [secondary1, secondary2, secondary3, secondary4].map(
+ conn => configureFailPoint(conn, 'stopReplProducer'));
+
+// While replication is still entirely disabled, additionally disable replication partway
+// into the retryable write on other secondaries. The idea is that while secondary1 will
+// apply all of the oplog entries in a single batch, other secondaries will only apply up
+// to counterMajorityCommitted oplog entries.
+const counterTotal = oplogApplierBatchSize;
+const counterMajorityCommitted = counterTotal - 2;
+jsTestLog(`counterTotal: ${counterTotal}, counterMajorityCommitted: ${counterMajorityCommitted}`);
+const stopReplProducerOnDocumentFailpoints = [secondary2, secondary3, secondary4].map(
+ conn => configureFailPoint(conn,
+ 'stopReplProducerOnDocument',
+ {document: {"diff.u.counter": counterMajorityCommitted + 1}}));
+
+// Perform all the retryable write statements on the donor primary.
+const lsid = ({id: UUID()});
+assert.commandWorked(donorPrimary.getCollection(kNs).runCommand("update", {
+ updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+ lsid,
+ txnNumber: NumberLong(1),
+}));
+
+// Get the oplog entries for the last retryable write statement and the majority-committed one.
+const stmtTotal =
+ donorPrimary.getCollection("local.oplog.rs").findOne({"o.diff.u.counter": counterTotal});
+const stmtMajorityCommitted = donorPrimary.getCollection("local.oplog.rs").findOne({
+ "o.diff.u.counter": counterMajorityCommitted
+});
+
+assert.neq(null, stmtTotal);
+assert.neq(null, stmtMajorityCommitted);
+jsTestLog(`stmtTotal timestamp: ${tojson(stmtTotal.ts)}`);
+jsTestLog(`stmtMajorityCommitted timestamp: ${tojson(stmtMajorityCommitted.ts)}`);
+
+for (const fp of stopReplProducerFailpoints) {
+ fp.off();
+ // Wait for secondary1 to have applied through the counterTotal retryable write statement
+ // and for the other secondaries to have applied through the counterMajorityCommitted
+ // statement, so that secondary1 advances its stable timestamp once it learns that the other
+ // secondaries have also applied through counterMajorityCommitted.
+ assert.soon(() => {
+ const {optimes: {appliedOpTime, durableOpTime}} =
+ assert.commandWorked(fp.conn.adminCommand({replSetGetStatus: 1}));
+
+ print(`${fp.conn.host}: ${tojsononeline({
+ appliedOpTime,
+ durableOpTime,
+ stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+ })}`);
+
+ const stmtTarget = (fp.conn.host === secondary1.host) ? stmtTotal : stmtMajorityCommitted;
+
+ return bsonWoCompare(appliedOpTime.ts, stmtTarget.ts) >= 0 &&
+ bsonWoCompare(durableOpTime.ts, stmtTarget.ts) >= 0;
+ });
+}
+
+// Wait for secondary1 to have advanced its stable timestamp, and therefore updated the
+// committed snapshot.
+assert.soon(() => {
+ const {lastStableRecoveryTimestamp} =
+ assert.commandWorked(secondary1.adminCommand({replSetGetStatus: 1}));
+
+ print(`${secondary1.host}: ${tojsononeline({
+ lastStableRecoveryTimestamp,
+ stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+ })}`);
+
+ return bsonWoCompare(lastStableRecoveryTimestamp, stmtMajorityCommitted.ts) >= 0;
+});
+
+// Let the migration proceed and hang the recipient just before it waits for the retryable write
+// pre-fetch result to become majority committed.
+fpAfterConnectingTenantMigrationRecipientInstance.off();
+fpBeforeWaitingForRetryableWritePreFetchMajorityCommitted.wait();
+
+const startFetchingDonorOpTime = getStartFetchingDonorOpTime(recipientPrimary, migrationId);
+assert.eq(startFetchingDonorOpTime.ts, stmtMajorityCommitted.ts);
+
+// At this point, the recipient should have fetched retryable writes and put them into the
+// oplog buffer.
+const kOplogBufferNS = "config.repl.migration.oplog_" + migrationOpts.migrationIdString;
+const recipientOplogBuffer = recipientPrimary.getCollection(kOplogBufferNS);
+jsTestLog(`oplog buffer ns: ${kOplogBufferNS}`);
+
+// The number of entries fetched into the oplog buffer is counterMajorityCommitted - 1, since we
+// only fetch entries with timestamps strictly before startFetchingDonorOpTime (the commit point).
+const cursor = recipientOplogBuffer.find();
+const expectedCount = counterMajorityCommitted - 1;
+assert.eq(
+ cursor.itcount(), expectedCount, `Incorrect number of oplog entries: ${cursor.toArray()}`);
+
+// Resume replication on the lagging secondaries and wait for the migration to complete.
+for (const fp of stopReplProducerOnDocumentFailpoints) {
+ fp.off();
+}
+
+fpBeforeWaitingForRetryableWritePreFetchMajorityCommitted.off();
+
+TenantMigrationTest.assertCommitted(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+assert.commandWorked(tenantMigrationTest.forgetMigration(migrationOpts.migrationIdString));
+
+// After the migration, verify that re-running the same retryable write statements on the
+// recipient does not execute them again.
+let docAfterMigration = recipientPrimary.getCollection(kNs).findOne({_id: 0});
+assert.eq(docAfterMigration.counter, counterTotal);
+
+assert.commandWorked(recipientPrimary.getCollection(kNs).runCommand("update", {
+ updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+ lsid,
+ txnNumber: NumberLong(1),
+}));
+
+// The second query should return the same result as the first one, since the recipient should
+// have fetched the already-executed retryable writes from the donor.
+docAfterMigration = recipientPrimary.getCollection(kNs).findOne({_id: 0});
+assert.eq(docAfterMigration.counter, counterTotal);
+
+donorRst.stopSet();
+tenantMigrationTest.stop();
+})();
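As an aside, one way to observe the window this test creates is to compare local and majority reads of the sync source's 'config.transactions' entry. This is a hypothetical debugging snippet, not part of the patch; it assumes it is pasted into the test above after the startFetchingDonorOpTime assertion, while the stopReplProducerOnDocument failpoints are still active, so that secondary1 and lsid are in scope:

// Hypothetical debugging aid, not part of this commit.
secondary1.setSecondaryOk();
const txns = secondary1.getCollection("config.transactions");
const localEntry = txns.find({"_id.id": lsid.id}).readConcern("local").toArray();
const majorityEntry = txns.find({"_id.id": lsid.id}).readConcern("majority").toArray();
// The local read typically shows the coalesced entry for the whole applied batch, while the
// majority read reflects only the committed snapshot and may not contain the entry yet.
jsTestLog(`local: ${tojson(localEntry)}, majority: ${tojson(majorityEntry)}`);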
diff --git a/jstests/replsets/tenant_migration_retryable_write_retry.js b/jstests/replsets/tenant_migration_retryable_write_retry.js
index 3ad8e8e78b7..52a392dc7ae 100644
--- a/jstests/replsets/tenant_migration_retryable_write_retry.js
+++ b/jstests/replsets/tenant_migration_retryable_write_retry.js
@@ -322,17 +322,6 @@ const aggRes = donorPrimary.getDB("config").runCommand({
as: "history",
depthField: "depthForTenantMigration"
}},
- // Now that we have the whole chain, filter out entries that occurred after
- // `startFetchingTimestamp`, since these entries will be fetched during the oplog fetching
- // phase.
- {$set: {
- history: {
- $filter: {
- input: "$history",
- cond: {$lt: ["$$this.ts", startFetchingTimestamp]}
- }
- }
- }},
// Sort the oplog entries in each oplog chain.
{$set: {
history: {$reverseArray: {$reduce: {
@@ -351,6 +340,17 @@ const aggRes = donorPrimary.getDB("config").runCommand({
]},
}}},
}},
+ // Now that we have the whole sorted chain, filter out entries that occurred after
+ // `startFetchingTimestamp`, since these entries will be fetched during the oplog fetching
+ // phase.
+ {$set: {
+ history: {
+ $filter: {
+ input: "$history",
+ cond: {$lt: ["$$this.ts", startFetchingTimestamp]}
+ }
+ }
+ }},
// Combine the oplog entries.
{$set: {history: {$concatArrays: ["$preImageOps", "$history", "$postImageOps"]}}},
// Fetch the complete oplog entries and unwind oplog entries in each chain to the top-level