SERVER-59108 Resolve race with transaction operation not killed after stepdown

(cherry picked from commit 1b31e6ca3d25a35f31f48547aafe0ec33c8c9bfd)
author: Vesselina Ratcheva <vesselina.ratcheva@10gen.com> 2021-10-21 00:34:05 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-10-21 00:56:34 +0000
commit: 5bb885c0fd06484ca4b3a596b1959d64e3080e8b (patch)
tree: b72ec3d4ca4bfd28d19233b3110e6b8f79e0e3c0
parent: ab0980f5e5c20e11ebd2de0a40e515bf59defaef (diff)
download: mongo-5bb885c0fd06484ca4b3a596b1959d64e3080e8b.tar.gz
3 files changed, 103 insertions, 0 deletions
diff --git a/jstests/replsets/stepdown_race_with_transaction.js b/jstests/replsets/stepdown_race_with_transaction.js
new file mode 100644
index 00000000000..f1674f5ea3a
--- /dev/null
+++ b/jstests/replsets/stepdown_race_with_transaction.js
@@ -0,0 +1,78 @@
+/**
+ * Tests that multi-documment transactions no longer race with stepdown over
+ * "setAlwaysInterruptAtStepDownOrUp".
+ */
+
+(function() {
+"use strict";
+
+load("jstests/libs/fail_point_util.js");
+
+const rst = new ReplSetTest({nodes: 1});
+rst.startSet();
+rst.initiate();
+
+const dbName = "testdb";
+const collName = "testcoll";
+
+const primary = rst.getPrimary();
+const primaryDB = primary.getDB(dbName);
+const primaryColl = primaryDB.getCollection(collName);
+
+// Insert a document that we will later modify in a transaction.
+assert.commandWorked(primaryColl.insert({a: 1}));
+
+// In the first part of the race, we set the 'setAlwaysInterruptAtStepDownOrUp' too late,
+// after stepDown is already done interrupting operations.
+const txnFpBefore = configureFailPoint(primary, "hangBeforeSettingTxnInterruptFlag");
+
+// The second critical part of the race is when the transaction thread has already passed
+// the regular "not primary" checks by the time stepDown has completed and updated
+// writability. (This is the reason we check writability again in the accompanying patch.)
+const txnFpAfter =
+    configureFailPoint(primary, "hangAfterCheckingWritabilityForMultiDocumentTransactions");
+
+jsTestLog("Start the transaction in a parallel shell");
+const txn = startParallelShell(() => {
+    const session = db.getMongo().startSession();
+    const sessionDB = session.getDatabase("testdb");
+    const sessionColl = sessionDB.getCollection("testcoll");
+
+    session.startTransaction();
+    assert.commandFailedWithCode(sessionColl.insert({b: 2}), ErrorCodes.NotWritablePrimary);
+}, primary.port);
+
+jsTestLog("Wait on the first transaction fail point");
+txnFpBefore.wait();
+
+const stepdownFP = configureFailPoint(primary, "stepdownHangAfterGrabbingRSTL");
+
+jsTestLog("Issue a stepdown in a parallel shell");
+const stepdown = startParallelShell(() => {
+    assert.commandWorked(db.adminCommand({replSetStepDown: 10 * 60, force: true}));
+}, primary.port);
+
+jsTestLog("Wait on the stepdown fail point");
+stepdownFP.wait();
+
+// The txn will be forced to wait for stepdown to finish.
+jsTestLog("Release the first transaction fail point and wait in the second");
+txnFpBefore.off();
+txnFpAfter.wait();
+
+jsTestLog("Let stepdown finish");
+stepdownFP.off();
+stepdown();
+
+jsTestLog("Wait on the second transaction fail point");
+txnFpAfter.wait();
+
+jsTestLog("Let the transaction attempt finish");
+txnFpAfter.off();
+txn();
+
+jsTestLog("Checking that the transaction never succeeded");
+assert.eq(1, primaryColl.find().toArray().length);
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index 357d461c64f..29b9cfa4332 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -109,6 +109,8 @@ MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp);
 MONGO_FAIL_POINT_DEFINE(stepdownHangBeforeRSTLEnqueue);
 // Fail setMaintenanceMode with ErrorCodes::NotSecondary to simulate a concurrent election.
 MONGO_FAIL_POINT_DEFINE(setMaintenanceModeFailsWithNotSecondary);
+// Hang after grabbing the RSTL but before we start rejecting writes.
+MONGO_FAIL_POINT_DEFINE(stepdownHangAfterGrabbingRSTL);
 
 // Tracks the last state transition performed in this replca set.
 std::string lastStateTransition;
@@ -2072,6 +2074,9 @@ void ReplicationCoordinatorImpl::stepDown(OperationContext* opCtx,
     AutoGetRstlForStepUpStepDown arsd(
         this, opCtx, ReplicationCoordinator::OpsKillingStateTransitionEnum::kStepDown, deadline);
 
+    CurOpFailpointHelpers::waitWhileFailPointEnabled(
+        &stepdownHangAfterGrabbingRSTL, opCtx, "stepdownHangAfterGrabbingRSTL");
+
     stdx::unique_lock<Latch> lk(_mutex);
 
     opCtx->checkForInterrupt();
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index 0b01b2533fd..c1acb9cc44c 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -44,6 +44,7 @@
 #include "mongo/db/commands.h"
 #include "mongo/db/commands/test_commands_enabled.h"
 #include "mongo/db/commands/txn_cmds_gen.h"
+#include "mongo/db/concurrency/replication_state_transition_lock_guard.h"
 #include "mongo/db/curop.h"
 #include "mongo/db/curop_failpoint_helpers.h"
 #include "mongo/db/curop_metrics.h"
@@ -105,6 +106,8 @@ MONGO_FAIL_POINT_DEFINE(sleepMillisAfterCommandExecutionBegins);
 MONGO_FAIL_POINT_DEFINE(waitAfterCommandFinishesExecution);
 MONGO_FAIL_POINT_DEFINE(waitAfterNewStatementBlocksBehindPrepare);
 MONGO_FAIL_POINT_DEFINE(hangBeforeSessionCheckOut);
+MONGO_FAIL_POINT_DEFINE(hangBeforeSettingTxnInterruptFlag);
+MONGO_FAIL_POINT_DEFINE(hangAfterCheckingWritabilityForMultiDocumentTransactions);
 
 // Tracks the number of times a legacy unacknowledged write failed due to
 // not primary error resulted in network disconnection.
@@ -746,6 +749,8 @@ void execCommandDatabase(OperationContext* opCtx,
             // Kill this operation on step down even if it hasn't taken write locks yet, because it
             // could conflict with transactions from a new primary.
             if (inMultiDocumentTransaction) {
+                CurOpFailpointHelpers::waitWhileFailPointEnabled(
+                    &hangBeforeSettingTxnInterruptFlag, opCtx, "hangBeforeSettingTxnInterruptFlag");
                 opCtx->setAlwaysInterruptAtStepDownOrUp();
             }
 
@@ -787,6 +792,21 @@ void execCommandDatabase(OperationContext* opCtx,
                         "node is in drain mode",
                         optedIn || alwaysAllowed);
             }
+
+            // We acquire the RSTL which helps us here in two ways:
+            // 1) It forces us to wait out any outstanding stepdown attempts.
+            // 2) It guarantees that future RSTL holders will see the
+            // 'setAlwaysInterruptAtStepDownOrUp' flag we set above.
+            if (inMultiDocumentTransaction) {
+                CurOpFailpointHelpers::waitWhileFailPointEnabled(
+                    &hangAfterCheckingWritabilityForMultiDocumentTransactions,
+                    opCtx,
+                    "hangAfterCheckingWritabilityForMultiDocumentTransactions");
+                repl::ReplicationStateTransitionLockGuard rstl(opCtx, MODE_IX);
+                uassert(ErrorCodes::NotWritablePrimary,
+                        "Cannot start a transaction in a non-primary state",
+                        replCoord->canAcceptWritesForDatabase(opCtx, dbname));
+            }
         }
 
         if (command->adminOnly()) {
author	Vesselina Ratcheva <vesselina.ratcheva@10gen.com>	2021-10-21 00:34:05 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-10-21 00:56:34 +0000
commit	5bb885c0fd06484ca4b3a596b1959d64e3080e8b (patch)
tree	b72ec3d4ca4bfd28d19233b3110e6b8f79e0e3c0
parent	ab0980f5e5c20e11ebd2de0a40e515bf59defaef (diff)
download	mongo-5bb885c0fd06484ca4b3a596b1959d64e3080e8b.tar.gz