author     Matthew Russotto <matthew.russotto@mongodb.com>   2022-04-29 14:47:45 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2022-05-16 16:35:44 +0000
commit     fa5d029fd8a6f06e42be69a1ba7842ca35e39333 (patch)
tree       9e4a3c38c44e3e8a7936a9b79e5182bd6df5022e
parent     6501f4a1e5cfd08bd2a51c1700e125f5e4933b90 (diff)
download   mongo-fa5d029fd8a6f06e42be69a1ba7842ca35e39333.tar.gz
SERVER-66089 Initial sync should do transaction table read with a later afterClusterTime.
(cherry picked from commit 878b6bb32413dc105133ffac31e9a0cf21e930f4)
-rw-r--r--  etc/backports_required_for_multiversion_tests.yml           4
-rw-r--r--  jstests/replsets/initial_sync_with_partial_transaction.js  98
-rw-r--r--  src/mongo/db/repl/initial_syncer.cpp                        14
3 files changed, 111 insertions(+), 5 deletions(-)
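In brief, the bug this commit fixes: when initial sync chooses a secondary sync source that is mid-way through applying a large transaction, the transactions-table read that computes the begin-fetching point can run against a stale lastAppliedSnapshot and return an inaccurate oldest-active-transaction timestamp, so oplog fetching can start after the transaction's first entries and the transaction is applied only partially. The fix reads config.transactions with an afterClusterTime of the defaultBeginFetchingOpTime instead of Timestamp(0, 1), waiting for all previous writes to become visible; a rough shell-level sketch of the changed read follows the C++ hunk at the end of this page.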
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index a0aa3bcafd1..2ba3f9189e9 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -218,6 +218,8 @@ last-continuous:
test_file: jstests/replsets/dbhash_lock_acquisition.js
- ticket: SERVER-66389
test_file: jstests/core/where_multiple_plans.js
+ - ticket: SERVER-66089
+ test_file: jstests/replsets/initial_sync_with_partial_transaction.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
@@ -571,6 +573,8 @@ last-lts:
test_file: jstests/replsets/dbhash_lock_acquisition.js
- ticket: SERVER-66389
test_file: jstests/core/where_multiple_plans.js
+ - ticket: SERVER-66089
+ test_file: jstests/replsets/initial_sync_with_partial_transaction.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:
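(Per this file's convention, the two entries above exclude the new test from the last-continuous and last-lts multiversion suites until the SERVER-66089 fix has been backported.)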
diff --git a/jstests/replsets/initial_sync_with_partial_transaction.js b/jstests/replsets/initial_sync_with_partial_transaction.js
new file mode 100644
index 00000000000..a96387bbb31
--- /dev/null
+++ b/jstests/replsets/initial_sync_with_partial_transaction.js
@@ -0,0 +1,98 @@
+/**
+ * Tests that an initial sync that starts in the middle of a large transaction on a secondary
+ * is able to complete and apply the entire transaction.
+ *
+ * @tags: [
+ * exclude_from_large_txns,
+ * uses_transactions,
+ * ]
+ */
+(function() {
+"use strict";
+load("jstests/replsets/rslib.js"); // For reconnect()
+load("jstests/libs/fail_point_util.js");
+
+const replTest = new ReplSetTest({
+ nodes: [{}, {rsConfig: {priority: 0}}],
+ nodeOptions: {
+ setParameter:
+ // We want two entries in each oplog batch, so the beginning of the transaction is not
+ // the end of the batch.
+ {maxNumberOfTransactionOperationsInSingleOplogEntry: 1, bgSyncOplogFetcherBatchSize: 2}
+ },
+});
+
+replTest.startSet();
+replTest.initiateWithHighElectionTimeout();
+
+// We have to add and pause the initial sync node here, because we cannot re-initiate the set while
+// the failpoints are held.
+jsTestLog("Adding initial sync node");
+const initialSyncNode = replTest.add({
+ rsConfig: {priority: 0, votes: 0},
+ setParameter: {
+ 'initialSyncSourceReadPreference': 'secondary',
+ 'failpoint.initialSyncHangBeforeChoosingSyncSource': tojson({mode: 'alwaysOn'}),
+ 'logComponentVerbosity': tojsononeline({replication: {initialSync: 1}})
+ }
+});
+replTest.reInitiate();
+
+const dbName = jsTest.name();
+const collName = "coll";
+
+const primary = replTest.getPrimary();
+const testDB = primary.getDB(dbName);
+const syncSource = replTest.nodes[1];
+
+assert.commandWorked(testDB.runCommand({create: collName, writeConcern: {w: "majority"}}));
+
+jsTest.log("Stop secondary oplog replication before the last operation in the transaction.");
+// The stopReplProducerOnDocument failpoint ensures that secondary stops replicating before
+// accepting the last two operations in the transaction.
+const stopReplProducerOnDocumentFailPoint = configureFailPoint(
+ syncSource, "stopReplProducerOnDocument", {document: {"applyOps.o._id": "next-last in txn"}});
+
+// This will cause us to pause batch application at a critical point, with "first in txn" and
+// "next in txn" in the oplog but not applied.
+const pauseBatchApplicationAfterWritingOplogEntriesFailPoint =
+ configureFailPoint(syncSource, "pauseBatchApplicationAfterWritingOplogEntries");
+
+jsTestLog("Starting transaction");
+const session = primary.startSession({causalConsistency: false});
+const sessionDB = session.getDatabase(dbName);
+session.startTransaction({writeConcern: {w: 1}});
+
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "first in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next-last in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "last in txn"}));
+
+jsTestLog("Committing transaction");
+assert.commandWorked(session.commitTransaction_forTesting());
+stopReplProducerOnDocumentFailPoint.wait();
+pauseBatchApplicationAfterWritingOplogEntriesFailPoint.wait();
+
+jsTestLog("Starting initial sync");
+assert.commandWorked(initialSyncNode.adminCommand(
+ {configureFailPoint: "initialSyncHangBeforeChoosingSyncSource", mode: "off"}));
+
+// The bug we're testing for is a race, and the fix works by waiting for the race to resolve.
+// That makes it very difficult to test deterministically: if we used failpoints to force the
+// race to happen, we would simply hang when the code is correct. Instead we hold the sync
+// source in the condition that causes the race until the initial sync node is about to hit it,
+// then release it, so that the test fails most of the time when the bug is present.
+jsTestLog("Waiting to fetch the defaultBeginFetchingOplogEntry");
+checkLog.containsJson(initialSyncNode, 6608900);
+stopReplProducerOnDocumentFailPoint.off();
+pauseBatchApplicationAfterWritingOplogEntriesFailPoint.off();
+replTest.awaitSecondaryNodes();
+replTest.awaitReplication();
+
+// Make sure we got the transaction.
+assert.eq(4, initialSyncNode.getDB(dbName)[collName].find().itcount());
+
+printjson(initialSyncNode.getDB("local").oplog.rs.find().toArray());
+
+replTest.stopSet();
+})();
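Beyond the itcount() check above, one could also confirm that the whole transaction chain reached the initial sync node's oplog. The following is an illustrative sketch, not part of the committed test; it assumes the ReplSetTest state above and that, with maxNumberOfTransactionOperationsInSingleOplogEntry=1, each of the transaction's four inserts lands in its own applyOps oplog entry:

// Sketch only (not in the committed test): count the applyOps entries that
// carry the transaction's inserts on the initial sync node.
const txnOps = initialSyncNode.getDB("local")
                   .oplog.rs.find({"o.applyOps.o._id": {$regex: /in txn/}})
                   .toArray();
assert.eq(4, txnOps.length, tojson(txnOps));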
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index d1b9b699ada..02fb4af0be5 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -898,6 +898,10 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(
std::string logMsg = str::stream() << "Initial Syncer got the defaultBeginFetchingTimestamp: "
<< defaultBeginFetchingOpTime.toString();
pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg);
+ LOGV2_DEBUG(6608900,
+ 1,
+ "Initial Syncer got the defaultBeginFetchingOpTime",
+ "defaultBeginFetchingOpTime"_attr = defaultBeginFetchingOpTime);
status = _scheduleGetBeginFetchingOpTime_inlock(onCompletionGuard, defaultBeginFetchingOpTime);
if (!status.isOK()) {
@@ -913,10 +917,10 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock(
const auto preparedState = DurableTxnState_serializer(DurableTxnStateEnum::kPrepared);
const auto inProgressState = DurableTxnState_serializer(DurableTxnStateEnum::kInProgress);
- // Obtain the oldest active transaction timestamp from the remote by querying their
- // transactions table. To prevent oplog holes from causing this query to return an inaccurate
- // timestamp, we specify an afterClusterTime of Timestamp(0, 1) so that we wait for all previous
- // writes to be visible.
+ // Obtain the oldest active transaction timestamp from the remote by querying their transactions
+ // table. To prevent oplog holes (primary) or a stale lastAppliedSnapshot (secondary) from
+ // causing this query to return an inaccurate timestamp, we specify an afterClusterTime of the
+ // defaultBeginFetchingOpTime so that we wait for all previous writes to be visible.
BSONObjBuilder cmd;
cmd.append("find", NamespaceString::kSessionTransactionsTableNamespace.coll().toString());
cmd.append("filter",
@@ -925,7 +929,7 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock(
cmd.append("readConcern",
BSON("level"
<< "local"
- << "afterClusterTime" << Timestamp(0, 1)));
+ << "afterClusterTime" << defaultBeginFetchingOpTime.getTimestamp()));
cmd.append("limit", 1);
_beginFetchingOpTimeFetcher = std::make_unique<Fetcher>(
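For illustration, here is a rough shell-level analogue of the read this hunk builds. It is a sketch under assumptions, not the server code: the InitialSyncer obtains defaultBeginFetchingOpTime from the sync source's newest oplog entry, the filter shown is inferred from the preparedState/inProgressState variables above, and the variable names are ours.

// Sketch: approximate shell equivalent of the find command assembled in
// _scheduleGetBeginFetchingOpTime_inlock (the real command also sorts so the
// oldest active transaction comes first).
const lastEntry =
    syncSource.getDB("local").oplog.rs.find().sort({$natural: -1}).limit(1).next();
const res = syncSource.getDB("config").runCommand({
    find: "transactions",  // config.transactions, the session transactions table
    filter: {state: {$in: ["prepared", "inProgress"]}},
    readConcern: {level: "local", afterClusterTime: lastEntry.ts},  // was Timestamp(0, 1)
    limit: 1
});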