3 files changed, 111 insertions, 5 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml
index a0aa3bcafd1..2ba3f9189e9 100644
--- a/etc/backports_required_for_multiversion_tests.yml
+++ b/etc/backports_required_for_multiversion_tests.yml
@@ -218,6 +218,8 @@ last-continuous:
       test_file: jstests/replsets/dbhash_lock_acquisition.js
     - ticket: SERVER-66389
       test_file: jstests/core/where_multiple_plans.js
+    - ticket: SERVER-66089
+      test_file: jstests/replsets/initial_sync_with_partial_transaction.js
 
   # Tests that should only be excluded from particular suites should be listed under that suite.
   suites:
@@ -571,6 +573,8 @@ last-lts:
       test_file: jstests/replsets/dbhash_lock_acquisition.js
     - ticket: SERVER-66389
       test_file: jstests/core/where_multiple_plans.js
+    - ticket: SERVER-66089
+      test_file: jstests/replsets/initial_sync_with_partial_transaction.js
 
   # Tests that should only be excluded from particular suites should be listed under that suite.
   suites:
diff --git a/jstests/replsets/initial_sync_with_partial_transaction.js b/jstests/replsets/initial_sync_with_partial_transaction.js
new file mode 100644
index 00000000000..a96387bbb31
--- /dev/null
+++ b/jstests/replsets/initial_sync_with_partial_transaction.js
@@ -0,0 +1,98 @@
+/**
+ * Tests that an initial sync that starts in the middle of a large transaction on a secondary
+ * is able to complete and apply the entire transaction.
+ *
+ * @tags: [
+ *   exclude_from_large_txns,
+ *   uses_transactions,
+ * ]
+ */
+(function() {
+"use strict";
+load("jstests/replsets/rslib.js");  // For reconnect()
+load("jstests/libs/fail_point_util.js");
+
+const replTest = new ReplSetTest({
+    nodes: [{}, {rsConfig: {priority: 0}}],
+    nodeOptions: {
+        setParameter:
+            // We want two entries in each oplog batch, so the beginning of the transaction is not
+            // the end of the batch.
+            {maxNumberOfTransactionOperationsInSingleOplogEntry: 1, bgSyncOplogFetcherBatchSize: 2}
+    },
+});
+
+replTest.startSet();
+replTest.initiateWithHighElectionTimeout();
+
+// We have to add and pause the initial sync node here, because we cannot re-initiate the set while
+// the failpoints are held.
+jsTestLog("Adding initial sync node");
+const initialSyncNode = replTest.add({
+    rsConfig: {priority: 0, votes: 0},
+    setParameter: {
+        'initialSyncSourceReadPreference': 'secondary',
+        'failpoint.initialSyncHangBeforeChoosingSyncSource': tojson({mode: 'alwaysOn'}),
+        'logComponentVerbosity': tojsononeline({replication: {initialSync: 1}})
+    }
+});
+replTest.reInitiate();
+
+const dbName = jsTest.name();
+const collName = "coll";
+
+const primary = replTest.getPrimary();
+const testDB = primary.getDB(dbName);
+const syncSource = replTest.nodes[1];
+
+assert.commandWorked(testDB.runCommand({create: collName, writeConcern: {w: "majority"}}));
+
+jsTest.log("Stop secondary oplog replication before the last operation in the transaction.");
+// The stopReplProducerOnDocument failpoint ensures that secondary stops replicating before
+// accepting the last two operations in the transaction.
+const stopReplProducerOnDocumentFailPoint = configureFailPoint(
+    syncSource, "stopReplProducerOnDocument", {document: {"applyOps.o._id": "next-last in txn"}});
+
+// This will cause us to pause batch application at a critical point, with "first in txn" and
+// "next in txn" in the oplog but not applied.
+const pauseBatchApplicationAfterWritingOplogEntriesFailPoint =
+    configureFailPoint(syncSource, "pauseBatchApplicationAfterWritingOplogEntries");
+
+jsTestLog("Starting transaction");
+const session = primary.startSession({causalConsistency: false});
+const sessionDB = session.getDatabase(dbName);
+session.startTransaction({writeConcern: {w: 1}});
+
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "first in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next-last in txn"}));
+assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "last in txn"}));
+
+jsTestLog("Committing transaction");
+assert.commandWorked(session.commitTransaction_forTesting());
+stopReplProducerOnDocumentFailPoint.wait();
+pauseBatchApplicationAfterWritingOplogEntriesFailPoint.wait();
+
+jsTestLog("Starting initial sync");
+assert.commandWorked(initialSyncNode.adminCommand(
+    {configureFailPoint: "initialSyncHangBeforeChoosingSyncSource", mode: "off"}));
+
+// The bug we're testing for is a race.  The way we fix the race is to wait until it resolves.
+// So it's very difficult to test it deterministically; if we use failpoints to force the race to
+// happen, we'll just hang when the code is correct.  Instead we hold the sync source in the
+// condition that causes the race until we reach the point it's about to happen, then release
+// it hoping it'll fail most of the time when the bug is present.
+jsTestLog("Waiting to fetch the defaultBeginFetchingOplogEntry");
+checkLog.containsJson(initialSyncNode, 6608900);
+stopReplProducerOnDocumentFailPoint.off();
+pauseBatchApplicationAfterWritingOplogEntriesFailPoint.off();
+replTest.awaitSecondaryNodes();
+replTest.awaitReplication();
+
+// Make sure we got the transaction.
+assert.eq(4, initialSyncNode.getDB(dbName)[collName].find().itcount());
+
+printjson(initialSyncNode.getDB("local").oplog.rs.find().toArray());
+
+replTest.stopSet();
+})();
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index d1b9b699ada..02fb4af0be5 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -898,6 +898,10 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime(
     std::string logMsg = str::stream() << "Initial Syncer got the defaultBeginFetchingTimestamp: "
                                        << defaultBeginFetchingOpTime.toString();
     pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg);
+    LOGV2_DEBUG(6608900,
+                1,
+                "Initial Syncer got the defaultBeginFetchingOpTime",
+                "defaultBeginFetchingOpTime"_attr = defaultBeginFetchingOpTime);
 
     status = _scheduleGetBeginFetchingOpTime_inlock(onCompletionGuard, defaultBeginFetchingOpTime);
     if (!status.isOK()) {
@@ -913,10 +917,10 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock(
     const auto preparedState = DurableTxnState_serializer(DurableTxnStateEnum::kPrepared);
     const auto inProgressState = DurableTxnState_serializer(DurableTxnStateEnum::kInProgress);
 
-    // Obtain the oldest active transaction timestamp from the remote by querying their
-    // transactions table. To prevent oplog holes from causing this query to return an inaccurate
-    // timestamp, we specify an afterClusterTime of Timestamp(0, 1) so that we wait for all previous
-    // writes to be visible.
+    // Obtain the oldest active transaction timestamp from the remote by querying their transactions
+    // table. To prevent oplog holes (primary) or a stale lastAppliedSnapshot (secondary) from
+    // causing this query to return an inaccurate timestamp, we specify an afterClusterTime of the
+    // defaultBeginFetchingOpTime so that we wait for all previous writes to be visible.
     BSONObjBuilder cmd;
     cmd.append("find", NamespaceString::kSessionTransactionsTableNamespace.coll().toString());
     cmd.append("filter",
@@ -925,7 +929,7 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock(
     cmd.append("readConcern",
                BSON("level"
                     << "local"
-                    << "afterClusterTime" << Timestamp(0, 1)));
+                    << "afterClusterTime" << defaultBeginFetchingOpTime.getTimestamp()));
     cmd.append("limit", 1);
 
     _beginFetchingOpTimeFetcher = std::make_unique<Fetcher>(