SERVER-41037 kill all user operations that hit prepare conflicts on step up

author: Pavi Vetriselvan <pvselvan@umich.edu> 2019-05-23 14:30:45 -0400
committer: Pavi Vetriselvan <pvselvan@umich.edu> 2019-05-23 14:32:25 -0400
commit: 9b8814b6e7130652eca6f8fdaa9ff215b07d2ebb (patch)
tree: ef6b1f1b6e75a1ee830c42edf579b9711703c2c2
parent: d08633878f0cc33b5484decdb8df17845c842089 (diff)
download: mongo-9b8814b6e7130652eca6f8fdaa9ff215b07d2ebb.tar.gz
3 files changed, 135 insertions, 6 deletions
diff --git a/jstests/replsets/kills_reads_with_prepare_conflicts_during_stepup.js b/jstests/replsets/kills_reads_with_prepare_conflicts_during_stepup.js
new file mode 100644
index 00000000000..3ad0bd22248
--- /dev/null
+++ b/jstests/replsets/kills_reads_with_prepare_conflicts_during_stepup.js
@@ -0,0 +1,128 @@
+/*
+ * Tests that a read operation on a secondary that encounters a prepare conflict gets killed
+ * when we cause the secondary to step up.
+ *
+ * @tags: [uses_transactions, uses_prepare_transaction]
+ */
+
+(function() {
+    "use strict";
+
+    load("jstests/core/txns/libs/prepare_helpers.js");
+    load("jstests/libs/check_log.js");
+
+    var rst = new ReplSetTest({nodes: 2});
+    rst.startSet();
+
+    const config = rst.getReplSetConfig();
+    // Increase the election timeout so that we do not accidentally trigger an election before
+    // we make the secondary step up.
+    config.settings = {"electionTimeoutMillis": 12 * 60 * 60 * 1000};
+    rst.initiate(config);
+
+    let primary = rst.getPrimary();
+    let secondary = rst.getSecondary();
+
+    const dbName = "test";
+    const collName = "kill_reads_with_prepare_conflicts_during_step_up";
+
+    const primaryDB = primary.getDB(dbName);
+    const primaryColl = primaryDB[collName];
+
+    let session = primary.startSession();
+    const sessionID = session.getSessionId();
+    let sessionDB = session.getDatabase(dbName);
+    const sessionColl = sessionDB.getCollection(collName);
+
+    assert.commandWorked(secondary.adminCommand(
+        {configureFailPoint: "WTPrintPrepareConflictLog", mode: "alwaysOn"}));
+
+    // Insert a document that we will later modify in a transaction.
+    assert.commandWorked(primaryColl.insert({_id: 1}));
+
+    jsTestLog("Start a transaction and prepare it");
+    session.startTransaction();
+    assert.commandWorked(sessionColl.update({_id: 1}, {_id: 1, a: 1}));
+    const prepareTimestamp = PrepareHelpers.prepareTransaction(session);
+
+    // Advance the clusterTime with another insert.
+    const clusterTimeAfterPrepare =
+        assert
+            .commandWorked(primaryColl.runCommand(
+                "insert", {documents: [{advanceClusterTime: 1}], writeConcern: {w: "majority"}}))
+            .operationTime;
+
+    // Ensure that the secondary replicates the prepare and the additional insert.
+    rst.awaitReplication();
+
+    // Make sure a secondary read using afterClusterTime times out when trying to
+    // read a prepared document.
+    const secondaryDB = secondary.getDB(dbName);
+    assert.commandFailedWithCode(secondaryDB.runCommand({
+        find: collName,
+        filter: {_id: 1},
+        readConcern: {afterClusterTime: clusterTimeAfterPrepare},
+        maxTimeMS: 2 * 1000  // 2 seconds
+    }),
+                                 ErrorCodes.MaxTimeMSExpired);
+
+    // Clear secondary log so that when we wait for the WTPrintPrepareConflictLog fail point, we
+    // do not count the previous find.
+    assert.commandWorked(secondaryDB.adminCommand({clearLog: "global"}));
+
+    TestData.dbName = dbName;
+    TestData.collName = collName;
+    TestData.clusterTime = clusterTimeAfterPrepare;
+
+    const waitForSecondaryReadBlockedOnPrepareConflictThread = startParallelShell(() => {
+        // Allow for secondary reads.
+        db.getMongo().setSlaveOk();
+        const parallelTestDB = db.getSiblingDB(TestData.dbName);
+        const parallelTestCollName = TestData.collName;
+
+        // The following read should block on the prepared transaction since it will be
+        // reading a conflicting document using an afterClusterTime later than the
+        // prepareTimestamp.
+        assert.commandFailedWithCode(parallelTestDB.runCommand({
+            find: parallelTestCollName,
+            filter: {_id: 1},
+            readConcern: {afterClusterTime: TestData.clusterTime}
+        }),
+                                     ErrorCodes.InterruptedDueToReplStateChange);
+    }, secondary.port);
+
+    jsTestLog("Waiting for failpoint");
+    checkLog.contains(secondary, "WTPrintPrepareConflictLog fail point enabled");
+
+    // Once we've confirmed that the find command has hit a prepare conflict on the secondary, cause
+    // that secondary to step up.
+    jsTestLog("Stepping up secondary");
+    rst.stepUp(secondary);
+
+    waitForSecondaryReadBlockedOnPrepareConflictThread();
+
+    rst.waitForState(secondary, ReplSetTest.State.PRIMARY);
+    rst.waitForState(primary, ReplSetTest.State.SECONDARY);
+
+    primary = rst.getPrimary();
+
+    // Make sure we can successfully commit the prepared transaction.
+    jsTestLog("Restoring shell session state");
+    session = PrepareHelpers.createSessionWithGivenId(primary, sessionID);
+    sessionDB = session.getDatabase(dbName);
+    // The transaction on this session should have a txnNumber of 0. We explicitly set this
+    // since createSessionWithGivenId does not restore the current txnNumber in the shell.
+    session.setTxnNumber_forTesting(0);
+    const txnNumber = session.getTxnNumber_forTesting();
+
+    jsTestLog("Committing transaction");
+    // Commit the transaction.
+    assert.commandWorked(sessionDB.adminCommand({
+        commitTransaction: 1,
+        commitTimestamp: prepareTimestamp,
+        txnNumber: NumberLong(txnNumber),
+        autocommit: false,
+    }));
+
+    rst.stopSet();
+})();
+\ No newline at end of file
diff --git a/src/mongo/db/prepare_conflict_tracker.h b/src/mongo/db/prepare_conflict_tracker.h
index e9deec22022..48e4fa1a063 100644
--- a/src/mongo/db/prepare_conflict_tracker.h
+++ b/src/mongo/db/prepare_conflict_tracker.h
@@ -36,11 +36,8 @@ namespace mongo {
 
 /**
  * The PrepareConflictTracker tracks if a read operation encounters a prepare conflict. If it
- * is blocked on a prepare conflict, we will kill the operation during step down. This will
- * help us avoid deadlocks between prepare conflicts and state transitions.
- *
- * TODO SERVER-41037: Modify above comment to include step up or use "state transitions" to
- * encompass both.
+ * is blocked on a prepare conflict, we will kill the operation during state transitions (step
+ * up/step down). This will help us avoid deadlocks between prepare conflicts and state transitions.
  */
 class PrepareConflictTracker {
 public:
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index dceb978a6f2..87bed913f8a 100644
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -1025,7 +1025,11 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
 
     _externalState->onDrainComplete(opCtx);
 
-    ReplicationStateTransitionLockGuard transitionGuard(opCtx, MODE_X);
+    // Kill all user writes, as well as user reads that encounter a prepare conflict. Also kills
+    // select internal operations. Although secondaries cannot accept writes, a step up can kill
+    // writes that were blocked behind the RSTL held by a step down attempt. These writes will be
+    // killed with a retryable error code during step up.
+    AutoGetRstlForStepUpStepDown arsu(this, opCtx);
     lk.lock();
 
     // Exit drain mode only if we're actually in draining mode, the apply buffer is empty in the
author	Pavi Vetriselvan <pvselvan@umich.edu>	2019-05-23 14:30:45 -0400
committer	Pavi Vetriselvan <pvselvan@umich.edu>	2019-05-23 14:32:25 -0400
commit	9b8814b6e7130652eca6f8fdaa9ff215b07d2ebb (patch)
tree	ef6b1f1b6e75a1ee830c42edf579b9711703c2c2
parent	d08633878f0cc33b5484decdb8df17845c842089 (diff)
download	mongo-9b8814b6e7130652eca6f8fdaa9ff215b07d2ebb.tar.gz