diff options
-rw-r--r-- | etc/backports_required_for_multiversion_tests.yml | 4 | ||||
-rw-r--r-- | jstests/replsets/initial_sync_with_partial_transaction.js | 98 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer.cpp | 14 |
3 files changed, 111 insertions, 5 deletions
diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index a0aa3bcafd1..2ba3f9189e9 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -218,6 +218,8 @@ last-continuous: test_file: jstests/replsets/dbhash_lock_acquisition.js - ticket: SERVER-66389 test_file: jstests/core/where_multiple_plans.js + - ticket: SERVER-66089 + test_file: jstests/replsets/initial_sync_with_partial_transaction.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: @@ -571,6 +573,8 @@ last-lts: test_file: jstests/replsets/dbhash_lock_acquisition.js - ticket: SERVER-66389 test_file: jstests/core/where_multiple_plans.js + - ticket: SERVER-66089 + test_file: jstests/replsets/initial_sync_with_partial_transaction.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: diff --git a/jstests/replsets/initial_sync_with_partial_transaction.js b/jstests/replsets/initial_sync_with_partial_transaction.js new file mode 100644 index 00000000000..a96387bbb31 --- /dev/null +++ b/jstests/replsets/initial_sync_with_partial_transaction.js @@ -0,0 +1,98 @@ +/** + * Tests that an initial sync that starts in the middle of a large transaction on a secondary + * is able to complete and apply the entire transaction. + * + * @tags: [ + * exclude_from_large_txns, + * uses_transactions, + * ] + */ +(function() { +"use strict"; +load("jstests/replsets/rslib.js"); // For reconnect() +load("jstests/libs/fail_point_util.js"); + +const replTest = new ReplSetTest({ + nodes: [{}, {rsConfig: {priority: 0}}], + nodeOptions: { + setParameter: + // We want two entries in each oplog batch, so the beginning of the transaction is not + // the end of the batch. + {maxNumberOfTransactionOperationsInSingleOplogEntry: 1, bgSyncOplogFetcherBatchSize: 2} + }, +}); + +replTest.startSet(); +replTest.initiateWithHighElectionTimeout(); + +// We have to add and pause the initial sync node here, because we cannot re-initiate the set while +// the failpoints are held. +jsTestLog("Adding initial sync node"); +const initialSyncNode = replTest.add({ + rsConfig: {priority: 0, votes: 0}, + setParameter: { + 'initialSyncSourceReadPreference': 'secondary', + 'failpoint.initialSyncHangBeforeChoosingSyncSource': tojson({mode: 'alwaysOn'}), + 'logComponentVerbosity': tojsononeline({replication: {initialSync: 1}}) + } +}); +replTest.reInitiate(); + +const dbName = jsTest.name(); +const collName = "coll"; + +const primary = replTest.getPrimary(); +const testDB = primary.getDB(dbName); +const syncSource = replTest.nodes[1]; + +assert.commandWorked(testDB.runCommand({create: collName, writeConcern: {w: "majority"}})); + +jsTest.log("Stop secondary oplog replication before the last operation in the transaction."); +// The stopReplProducerOnDocument failpoint ensures that secondary stops replicating before +// accepting the last two operations in the transaction. +const stopReplProducerOnDocumentFailPoint = configureFailPoint( + syncSource, "stopReplProducerOnDocument", {document: {"applyOps.o._id": "next-last in txn"}}); + +// This will cause us to pause batch application at a critical point, with "first in txn" and +// "next in txn" in the oplog but not applied. +const pauseBatchApplicationAfterWritingOplogEntriesFailPoint = + configureFailPoint(syncSource, "pauseBatchApplicationAfterWritingOplogEntries"); + +jsTestLog("Starting transaction"); +const session = primary.startSession({causalConsistency: false}); +const sessionDB = session.getDatabase(dbName); +session.startTransaction({writeConcern: {w: 1}}); + +assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "first in txn"})); +assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next in txn"})); +assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "next-last in txn"})); +assert.commandWorked(sessionDB.getCollection(collName).insert({_id: "last in txn"})); + +jsTestLog("Committing transaction"); +assert.commandWorked(session.commitTransaction_forTesting()); +stopReplProducerOnDocumentFailPoint.wait(); +pauseBatchApplicationAfterWritingOplogEntriesFailPoint.wait(); + +jsTestLog("Starting initial sync"); +assert.commandWorked(initialSyncNode.adminCommand( + {configureFailPoint: "initialSyncHangBeforeChoosingSyncSource", mode: "off"})); + +// The bug we're testing for is a race. The way we fix the race is to wait until it resolves. +// So it's very difficult to test it deterministically; if we use failpoints to force the race to +// happen, we'll just hang when the code is correct. Instead we hold the sync source in the +// condition that causes the race until we reach the point it's about to happen, then release +// it hoping it'll fail most of the time when the bug is present. +jsTestLog("Waiting to fetch the defaultBeginFetchingOplogEntry"); +checkLog.containsJson(initialSyncNode, 6608900); +stopReplProducerOnDocumentFailPoint.off(); +pauseBatchApplicationAfterWritingOplogEntriesFailPoint.off(); +replTest.awaitSecondaryNodes(); +replTest.awaitReplication(); + +// Make sure we got the transaction. +assert.eq(4, initialSyncNode.getDB(dbName)[collName].find().itcount()); + +printjson(initialSyncNode.getDB("local").oplog.rs.find().toArray()); + +replTest.stopSet(); +})(); diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp index d1b9b699ada..02fb4af0be5 100644 --- a/src/mongo/db/repl/initial_syncer.cpp +++ b/src/mongo/db/repl/initial_syncer.cpp @@ -898,6 +898,10 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForDefaultBeginFetchingOpTime( std::string logMsg = str::stream() << "Initial Syncer got the defaultBeginFetchingTimestamp: " << defaultBeginFetchingOpTime.toString(); pauseAtInitialSyncFuzzerSyncronizationPoints(logMsg); + LOGV2_DEBUG(6608900, + 1, + "Initial Syncer got the defaultBeginFetchingOpTime", + "defaultBeginFetchingOpTime"_attr = defaultBeginFetchingOpTime); status = _scheduleGetBeginFetchingOpTime_inlock(onCompletionGuard, defaultBeginFetchingOpTime); if (!status.isOK()) { @@ -913,10 +917,10 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock( const auto preparedState = DurableTxnState_serializer(DurableTxnStateEnum::kPrepared); const auto inProgressState = DurableTxnState_serializer(DurableTxnStateEnum::kInProgress); - // Obtain the oldest active transaction timestamp from the remote by querying their - // transactions table. To prevent oplog holes from causing this query to return an inaccurate - // timestamp, we specify an afterClusterTime of Timestamp(0, 1) so that we wait for all previous - // writes to be visible. + // Obtain the oldest active transaction timestamp from the remote by querying their transactions + // table. To prevent oplog holes (primary) or a stale lastAppliedSnapshot (secondary) from + // causing this query to return an inaccurate timestamp, we specify an afterClusterTime of the + // defaultBeginFetchingOpTime so that we wait for all previous writes to be visible. BSONObjBuilder cmd; cmd.append("find", NamespaceString::kSessionTransactionsTableNamespace.coll().toString()); cmd.append("filter", @@ -925,7 +929,7 @@ Status InitialSyncer::_scheduleGetBeginFetchingOpTime_inlock( cmd.append("readConcern", BSON("level" << "local" - << "afterClusterTime" << Timestamp(0, 1))); + << "afterClusterTime" << defaultBeginFetchingOpTime.getTimestamp())); cmd.append("limit", 1); _beginFetchingOpTimeFetcher = std::make_unique<Fetcher>( |