From 5930149b09f51b1035b7a24556399a8777f9399c Mon Sep 17 00:00:00 2001
From: Vesselina Ratcheva
Date: Thu, 25 Aug 2022 19:33:37 +0000
Subject: SERVER-66854 Prevent step-up ops from being killed by killSessions commands

---
 ...ared_transaction_kill_during_step_up_refresh.js |  75 ++++++++++
 .../replsets/step_up_kill_abort_transactions.js    | 154 +++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 jstests/replsets/prepared_transaction_kill_during_step_up_refresh.js
 create mode 100644 jstests/replsets/step_up_kill_abort_transactions.js

(limited to 'jstests/replsets')

diff --git a/jstests/replsets/prepared_transaction_kill_during_step_up_refresh.js b/jstests/replsets/prepared_transaction_kill_during_step_up_refresh.js
new file mode 100644
index 00000000000..2852269417d
--- /dev/null
+++ b/jstests/replsets/prepared_transaction_kill_during_step_up_refresh.js
@@ -0,0 +1,75 @@
+/**
+ * Tests that the work to restore locks for prepared transactions on step up is not killable via
+ * killSessions commands.
+ *
+ * @tags: [uses_transactions, uses_prepare_transaction]
+ */
+(function() {
+"use strict";
+load("jstests/core/txns/libs/prepare_helpers.js");
+load("jstests/libs/fail_point_util.js");
+load("jstests/replsets/rslib.js");  // For reconnect()
+
+const rst = new ReplSetTest({nodes: 2, name: jsTestName()});
+rst.startSet();
+rst.initiateWithHighElectionTimeout();
+
+const dbName = "primaryDB";
+const collName = "testcoll";
+
+const primary = rst.getPrimary();
+const newPrimary = rst.getSecondary();
+
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB.getCollection(collName);
+assert.commandWorked(primaryDB.runCommand({create: collName, writeConcern: {w: "majority"}}));
+
+jsTestLog("Starting a transaction");
+const session = primary.startSession({causalConsistency: false});
+session.startTransaction({writeConcern: {w: "majority"}});
+const lsid = session.getSessionId().id;
+
+jsTestLog("LSID for our session is " + tojson(lsid));
+
+jsTestLog("Inserting a doc in a transaction.");
+const doc = {
+    _id: "txnDoc"
+};
+assert.commandWorked(session.getDatabase(dbName).getCollection(collName).insert(doc));
+
+jsTestLog("Putting transaction into prepare");
+const prepareTimestamp = PrepareHelpers.prepareTransaction(session);
+
+jsTestLog("Setting failpoint on new primary");
+const stepUpFP = configureFailPoint(newPrimary, "hangDuringStepUpPrepareRestoreLocks");
+
+jsTestLog("Stepping up new primary");
+// Do not wait for a writable primary: the failpoint set above is expected to hold the step up
+// until it is turned off below.
+rst.stepUp(newPrimary, {awaitWritablePrimary: false});
+reconnect(primary);
+
+jsTestLog("Waiting on new primary to hit step up failpoint");
+stepUpFP.wait();
+
+jsTestLog("Killing the session");
+const newPrimaryDB = newPrimary.getDB(dbName);
+assert.commandWorked(newPrimaryDB.runCommand({killSessions: [{id: lsid}]}));
+
+jsTestLog("Allowing step up to continue");
+stepUpFP.off();
+// Use assert.eq so the nodes are actually compared; assert(a, b) only checks that 'a' is truthy
+// and treats 'b' as the failure message.
+assert.eq(newPrimary, rst.getPrimary());
+
+jsTestLog("Committing transaction on the new primary");
+// Create a proxy session to reuse the session state of the old primary.
+const newSession = new _DelegatingDriverSession(newPrimary, session);
+
+assert.commandWorked(PrepareHelpers.commitTransaction(newSession, prepareTimestamp));
+
+// The committed transaction's insert must be visible through the old primary's connection.
+assert.eq(doc, primaryColl.findOne({}), primaryColl.find({}).toArray());
+
+rst.stopSet();
+})();
diff --git a/jstests/replsets/step_up_kill_abort_transactions.js b/jstests/replsets/step_up_kill_abort_transactions.js
new file mode 100644
index 00000000000..2c0ca84670a
--- /dev/null
+++ b/jstests/replsets/step_up_kill_abort_transactions.js
@@ -0,0 +1,154 @@
+/**
+ * Tests that the work for aborting in-progress transactions on step up is not killable via
+ * killSessions commands.
+ *
+ * @tags: [
+ *   exclude_from_large_txns,
+ *   uses_transactions,
+ * ]
+ */
+
+(function() {
+"use strict";
+load("jstests/replsets/rslib.js");  // For reconnect()
+load("jstests/libs/fail_point_util.js");
+
+/**
+ * Returns the only document in config.transactions, as seen through 'db'. Asserts that the
+ * collection holds exactly one entry.
+ */
+function getTxnTableEntry(db) {
+    const txnTableEntries = db.getSiblingDB("config")["transactions"].find().toArray();
+    assert.eq(txnTableEntries.length, 1);
+    return txnTableEntries[0];
+}
+
+const rst = new ReplSetTest({
+    name: jsTestName(),
+    nodes: 3,
+    nodeOptions: {
+        setParameter:
+            // Make it easier to hold a transaction before it completes.
+            {maxNumberOfTransactionOperationsInSingleOplogEntry: 1, bgSyncOplogFetcherBatchSize: 1}
+    },
+});
+
+rst.startSet();
+const config = rst.getReplSetConfig();
+config.members[2].priority = 0;
+// Disable primary catchup and chaining.
+config.settings = {
+    catchUpTimeoutMillis: 0,
+    chainingAllowed: false
+};
+rst.initiate(config);
+
+setLogVerbosity(rst.nodes, {"replication": {"verbosity": 3}});
+
+const dbName = "testdb";
+const collName = "testcoll";
+
+const primary = rst.nodes[0];
+const primaryDB = primary.getDB(dbName);
+const newPrimary = rst.nodes[1];
+const newPrimaryDB = newPrimary.getDB(dbName);
+
+assert.commandWorked(primaryDB.runCommand({create: collName, writeConcern: {w: "majority"}}));
+
+// Prevent the priority: 0 node from fetching new ops so that it can vote for the new primary.
+const stopReplProducerFailPoint = configureFailPoint(rst.nodes[2], "stopReplProducer");
+
+jsTestLog("Stop secondary oplog replication before the last operation in the transaction.");
+// The stopReplProducerOnDocument failpoint ensures that secondary stops replicating before
+// applying the last operation in the transaction. This depends on the oplog fetcher batch size
+// being 1.
+const stopReplProducerOnDocumentFailPoint = configureFailPoint(
+    newPrimary, "stopReplProducerOnDocument", {document: {"applyOps.o._id": "last in txn"}});
+
+jsTestLog("Start a transaction.");
+const session = primary.startSession({causalConsistency: false});
+const sessionDB = session.getDatabase(dbName);
+const sessionColl = sessionDB.getCollection(collName);
+session.startTransaction({writeConcern: {w: "majority", wtimeout: 500}});
+
+const lsid = session.getSessionId().id;
+jsTestLog("LSID for our session is " + tojson(lsid));
+
+jsTestLog("Add inserts to transaction.");
+assert.commandWorked(sessionColl.insert({_id: "first in txn on primary " + primary}));
+assert.commandWorked(sessionColl.insert({_id: "last in txn"}));
+
+jsTestLog("Confirm we cannot commit the transaction due to insufficient replication.");
+const res = session.commitTransaction_forTesting();
+assert.commandFailedWithCode(res, ErrorCodes.WriteConcernFailed);
+
+jsTestLog("Find the start and commit optimes on the primary.");
+let txnTableEntry = getTxnTableEntry(primaryDB);
+assert.eq(txnTableEntry.state, "committed");
+const commitOpTime = txnTableEntry.lastWriteOpTime;
+const startOpTime =
+    primaryDB.getSiblingDB("local").oplog.rs.findOne({ts: commitOpTime.ts}).prevOpTime;
+
+jsTestLog("Wait for the new primary to block on fail point.");
+stopReplProducerOnDocumentFailPoint.wait();
+
+jsTestLog("Wait for the new primary to apply the first op of transaction at timestamp: " +
+          tojson(startOpTime));
+assert.soon(() => {
+    const lastOpTime = getLastOpTime(newPrimary);
+    jsTestLog("Current lastOpTime on the new primary: " + tojson(lastOpTime));
+    return rs.compareOpTimes(lastOpTime, startOpTime) >= 0;
+});
+
+// Now the transaction should be in-progress on the new primary.
+txnTableEntry = getTxnTableEntry(newPrimaryDB);
+assert.eq(txnTableEntry.state, "inProgress");
+// The startOpTime should be less than the commit optime.
+assert.eq(rs.compareOpTimes(txnTableEntry.startOpTime, commitOpTime), -1);
+
+jsTestLog("Set step up failpoint on new primary");
+const stepUpFP = configureFailPoint(newPrimary, "hangDuringStepUpAbortInProgressTransactions");
+
+jsTestLog("Step down primary via heartbeat.");
+// The step down is triggered indirectly, by electing 'newPrimary'.
+assert.commandWorked(newPrimary.adminCommand({replSetStepUp: 1}));
+rst.awaitNodesAgreeOnPrimary();
+reconnect(primary);
+
+jsTestLog("Wait for the new primary to stop replication after primary catch-up.");
+checkLog.contains(newPrimary, "Stopping replication producer");
+
+jsTestLog("Enable replication on the new primary so that it can continue the state transition");
+stopReplProducerOnDocumentFailPoint.off();
+
+jsTestLog("Wait on new primary to hit step up failpoint");
+stepUpFP.wait();
+
+jsTestLog("Attempt to kill the session");
+assert.commandWorked(newPrimaryDB.runCommand({killSessions: [{id: lsid}]}));
+
+jsTestLog("Allow step up to continue");
+stepUpFP.off();
+assert.eq(rst.getPrimary(), newPrimary);
+stopReplProducerFailPoint.off();
+rst.awaitReplication();
+
+jsTestLog("Verifying that the transaction has been aborted on the new primary.");
+// Create a proxy session to reuse the session state of the old primary.
+const newSession = new _DelegatingDriverSession(newPrimary, session);
+const newSessionDB = newSession.getDatabase(dbName);
+// The transaction should have been aborted.
+assert.commandFailedWithCode(newSessionDB.adminCommand({
+    commitTransaction: 1,
+    txnNumber: NumberLong(newSession.getTxnNumber_forTesting()),
+    autocommit: false,
+    writeConcern: {w: "majority"}
+}),
+                             ErrorCodes.NoSuchTransaction);
+
+jsTestLog("Verifying that the collection was not changed by the transaction.");
+assert.eq(primaryDB.getCollection(collName).find().itcount(), 0);
+assert.eq(newPrimaryDB.getCollection(collName).find().itcount(), 0);
+
+rst.stopSet();
+})();
-- 
cgit v1.2.1