summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2021-06-10 09:25:55 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-06-11 00:40:55 +0000
commit379db12e4d046c82f60732bcf11c98019f04bbf5 (patch)
tree219a52f6975da31c1a9a5cf3f36e18dc6099a9db
parent47f1fccaf3db0116b5f6e719651c35313a6d89fe (diff)
downloadmongo-379db12e4d046c82f60732bcf11c98019f04bbf5.tar.gz
SERVER-57476: Return a WriteConflict when a timestamped transaction hits a prepare conflict.
(cherry picked from commit 1e7e343fb6c90fbf0c62deabf61630353e2e5e29)
-rw-r--r--jstests/replsets/assert_on_prepare_conflict_with_hole.js134
-rw-r--r--src/mongo/db/storage/recovery_unit.h8
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp15
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h14
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h4
5 files changed, 175 insertions, 0 deletions
diff --git a/jstests/replsets/assert_on_prepare_conflict_with_hole.js b/jstests/replsets/assert_on_prepare_conflict_with_hole.js
new file mode 100644
index 00000000000..9138eaf324c
--- /dev/null
+++ b/jstests/replsets/assert_on_prepare_conflict_with_hole.js
@@ -0,0 +1,134 @@
+/**
+ * Constructs the following cycle that can lead to stalling a sharded cluster:
+ * | Preparer | Insert | OplogVisibility Ts |
+ * |---------------------------------------+---------------------------+--------------------|
+ * | BeginTxn | | |
+ * | Write A | | |
+ * | | BeginTxn | |
+ * | | Preallocates TS(10) | 9 |
+ * | (side txn commits prepare oplog @ 11) | | |
+ * | Prepare 11 | | |
+ * | | Write A (PrepareConflict) | |
+ *
+ * In this scenario, the prepared transaction blocks waiting for its prepare oplog entry at
+ * timestamp 11 to become majority committed. However, the prepare oplog entry cannot replicate to
+ * secondaries until the oplog visibility timestamp advances to 11. The oplog visibility timestamp
+ * advancing is blocked on the insert that allocated timestamps 10. The insert cannot make progress
+ * because it has hit a prepare conflict. The prepare conflict this test specifically exercises is
+ * for duplicate key detection on a non-_id unique index.
+ *
+ * @tags: [multiversion_incompatible, uses_transactions, uses_prepare_transaction]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/parallelTester.js");
+
+// Use a single node replica set for simplicity. Note that an oplog hole on a single node replica
+// will block new writes from becoming majority committed.
+const rst = new ReplSetTest({
+ nodes: 1,
+ nodeOptions: {
+ setParameter: {logComponentVerbosity: tojson({storage: 1})},
+ }
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const db = primary.getDB("test");
+
+const collName = "mycoll";
+assert.commandWorked(db.runCommand({create: collName, writeConcern: {w: "majority"}}));
+// A secondary unique index requires cursor positioning in WT which can result in hitting a prepare
+// conflict.
+assert.commandWorked(db[collName].createIndex({a: 1}, {unique: true}));
+
+// Start a multi-document transaction that inserts an `a: 2` update.
+const lsid = ({id: UUID()});
+assert.commandWorked(db.runCommand({
+ insert: collName,
+ documents: [{a: 2}],
+ lsid,
+ txnNumber: NumberLong(1),
+ autocommit: false,
+ startTransaction: true,
+}));
+
+// Prepare the `a: 2` update.
+let prepTs = assert.commandWorked(db.adminCommand({
+ prepareTransaction: 1,
+ lsid,
+ txnNumber: NumberLong(1),
+ autocommit: false
+}))["prepareTimestamp"];
+
+// In another thread, perform an insert that also attempts to touch the `a: 2` update. This insert
+// will block until the above transaction commits or aborts. If the above transaction commits, this
+// insert will fail with a duplicate key. If the above transaction is aborted, this insert will
+// succeed.
+//
+// This insert will open up a hole in the oplog preventing writes from becoming majority
+// committed. In a properly behaving system, we will notice this resource being held while
+// entering a blocking call (prepare conflict resolution) and retry the transaction (which
+// releases the resource that prevents writes from becoming majority committed).
+const triggerPrepareConflictThread = new Thread(function(host, ns) {
+ const conn = new Mongo(host);
+ const collection = conn.getCollection(ns);
+ jsTestLog("Inserting a conflicting operation while keeping a hole open.");
+ assert.commandFailedWithCode(collection.insert([{a: 1}, {a: 2}, {a: 3}]),
+ ErrorCodes.DuplicateKey);
+}, primary.host, db[collName].getFullName());
+
+triggerPrepareConflictThread.start();
+
+// Wait for the insert to be in the system before attempting the majority write. Technically, this
+// is insufficient to prove we're properly exercising the code that detects a possible deadlock and
+// releases resources. In these cases, the test succeeds because the (yet to happen) majority write
+// occurs before the above thread creates a hole.
+assert.soon(() => {
+ const ops = primary.getDB("admin")
+ .aggregate([
+ {$currentOp: {allUsers: true}},
+ {
+ $match: {
+ type: "op",
+ ns: db[collName].getFullName(),
+ "command.insert": {$exists: true},
+ }
+ }
+ ])
+ .toArray();
+
+ if (ops.length === 0) {
+ return false;
+ }
+
+ assert.eq(ops.length, 1, ops);
+ return true;
+});
+
+// If the system is misbehaving, this write will fail to "majority replicate". As noted above, in a
+// single node replica set, an operation must be visible in the oplog before it can be considered
+// majority replicated.
+jsTestLog("Doing the majority write.");
+assert.soon(() => {
+ assert.commandWorked(db.bla.insert({}, {writeConcern: {w: "majority"}}));
+ return true;
+});
+
+// We could stop the test here, but by committing the transaction we can also assert that the
+// `triggerPrepareConflictThread` sees a `DuplicateKey` error.
+jsTestLog({"Committing. CommitTs": prepTs});
+assert.commandWorked(db.adminCommand({
+ commitTransaction: 1,
+ lsid,
+ txnNumber: NumberLong(1),
+ autocommit: false,
+ commitTimestamp: prepTs
+}));
+
+triggerPrepareConflictThread.join();
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index ff76570aa51..f8e43160ebe 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -279,6 +279,14 @@ public:
}
/**
+ * Returns true if a commit timestamp has been assigned to writes in this transaction.
+ * Otherwise, returns false.
+ */
+ virtual bool isTimestamped() const {
+ return false;
+ }
+
+ /**
* Sets a timestamp that will be assigned to all future writes on this RecoveryUnit until
* clearCommitTimestamp() is called. This must be called either outside of a WUOW or on a
* prepared transaction after setPrepareTimestamp() is called. setTimestamp() must not be called
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
index 4cb08f0aa03..a2603224e25 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
@@ -33,11 +33,19 @@
#include "mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h"
+#include <mutex>
+
+#include "mongo/util/fail_point.h"
#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
+#include "mongo/util/stacktrace.h"
namespace mongo {
+namespace {
+std::once_flag logPrepareWithTimestampOnce;
+}
+
// When set, simulates WT_PREPARE_CONFLICT returned from WiredTiger API calls.
MONGO_FAIL_POINT_DEFINE(WTPrepareConflictForReads);
@@ -54,4 +62,11 @@ void wiredTigerPrepareConflictFailPointLog() {
log() << "WTPrintPrepareConflictLog fail point enabled.";
}
+void wiredTigerPrepareConflictOplogResourceLog() {
+ std::call_once(logPrepareWithTimestampOnce, [] {
+ log() << "Hit a prepare conflict while holding a resource on the oplog";
+ printStackTrace();
+ });
+}
+
} // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
index 9f57969cad7..3d3c361e98e 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
@@ -29,6 +29,7 @@
#pragma once
+#include "mongo/db/concurrency/write_conflict_exception.h"
#include "mongo/db/curop.h"
#include "mongo/db/prepare_conflict_tracker.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_record_store.h"
@@ -55,6 +56,8 @@ void wiredTigerPrepareConflictLog(int attempt);
*/
void wiredTigerPrepareConflictFailPointLog();
+void wiredTigerPrepareConflictOplogResourceLog();
+
/**
* Runs the argument function f as many times as needed for f to return an error other than
* WT_PREPARE_CONFLICT. Each time f returns WT_PREPARE_CONFLICT we wait until the current unit of
@@ -77,6 +80,17 @@ int wiredTigerPrepareConflictRetry(OperationContext* opCtx, F&& f) {
if (ret != WT_PREPARE_CONFLICT)
return ret;
+ if (opCtx->recoveryUnit()->isTimestamped()) {
+ // This transaction is holding a resource in the form of an oplog slot. Committed
+ // transactions that get a later oplog slot will be unable to replicate until this resource
+ // is released (in the form of this transaction committing or aborting). For this case, we
+ // choose to abort our transaction and retry instead of blocking. It's possible that we can
+ // be blocking on a prepared update that requires replication to make progress, creating a
+ // stall in the MDB cluster.
+ wiredTigerPrepareConflictOplogResourceLog();
+ throw WriteConflictException();
+ }
+
PrepareConflictTracker::get(opCtx).beginPrepareConflict();
auto client = opCtx->getClient();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 72f7d649729..6bbd13d079a 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -124,6 +124,10 @@ public:
Status setTimestamp(Timestamp timestamp) override;
+ bool isTimestamped() const override {
+ return _isTimestamped;
+ }
+
void setCommitTimestamp(Timestamp timestamp) override;
void clearCommitTimestamp() override;