diff options
author | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2021-06-10 09:25:55 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-06-11 00:40:55 +0000 |
commit | 379db12e4d046c82f60732bcf11c98019f04bbf5 (patch) | |
tree | 219a52f6975da31c1a9a5cf3f36e18dc6099a9db | |
parent | 47f1fccaf3db0116b5f6e719651c35313a6d89fe (diff) | |
download | mongo-379db12e4d046c82f60732bcf11c98019f04bbf5.tar.gz |
SERVER-57476: Return a WriteConflict when a timestamped transaction hits a prepare conflict.
(cherry picked from commit 1e7e343fb6c90fbf0c62deabf61630353e2e5e29)
5 files changed, 175 insertions, 0 deletions
diff --git a/jstests/replsets/assert_on_prepare_conflict_with_hole.js b/jstests/replsets/assert_on_prepare_conflict_with_hole.js
new file mode 100644
index 00000000000..9138eaf324c
--- /dev/null
+++ b/jstests/replsets/assert_on_prepare_conflict_with_hole.js
@@ -0,0 +1,134 @@
+/**
+ * Constructs the following cycle that can lead to stalling a sharded cluster:
+ * | Preparer                              | Insert                    | OplogVisibility Ts |
+ * |---------------------------------------+---------------------------+--------------------|
+ * | BeginTxn                              |                           |                    |
+ * | Write A                               |                           |                    |
+ * |                                       | BeginTxn                  |                    |
+ * |                                       | Preallocates TS(10)       | 9                  |
+ * | (side txn commits prepare oplog @ 11) |                           |                    |
+ * | Prepare 11                            |                           |                    |
+ * |                                       | Write A (PrepareConflict) |                    |
+ *
+ * In this scenario, the prepared transaction blocks waiting for its prepare oplog entry at
+ * timestamp 11 to become majority committed. However, the prepare oplog entry cannot replicate to
+ * secondaries until the oplog visibility timestamp advances to 11. The oplog visibility timestamp
+ * advancing is blocked on the insert that allocated timestamps 10. The insert cannot make progress
+ * because it has hit a prepare conflict. The prepare conflict this test specifically exercises is
+ * for duplicate key detection on a non-_id unique index.
+ *
+ * @tags: [multiversion_incompatible, uses_transactions, uses_prepare_transaction]
+ */
+(function() {
+"use strict";
+
+load("jstests/libs/parallelTester.js");
+
+// Use a single node replica set for simplicity. Note that an oplog hole on a single node replica
+// will block new writes from becoming majority committed.
+const rst = new ReplSetTest({
+    nodes: 1,
+    nodeOptions: {
+        setParameter: {logComponentVerbosity: tojson({storage: 1})},
+    }
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const db = primary.getDB("test");
+
+const collName = "mycoll";
+assert.commandWorked(db.runCommand({create: collName, writeConcern: {w: "majority"}}));
+// A secondary unique index requires cursor positioning in WT which can result in hitting a prepare
+// conflict.
+assert.commandWorked(db[collName].createIndex({a: 1}, {unique: true}));
+
+// Start a multi-document transaction that inserts an `a: 2` update.
+const lsid = ({id: UUID()});
+assert.commandWorked(db.runCommand({
+    insert: collName,
+    documents: [{a: 2}],
+    lsid,
+    txnNumber: NumberLong(1),
+    autocommit: false,
+    startTransaction: true,
+}));
+
+// Prepare the `a: 2` update.
+let prepTs = assert.commandWorked(db.adminCommand({
+    prepareTransaction: 1,
+    lsid,
+    txnNumber: NumberLong(1),
+    autocommit: false
+}))["prepareTimestamp"];
+
+// In another thread, perform an insert that also attempts to touch the `a: 2` update. This insert
+// will block until the above transaction commits or aborts. If the above transaction commits, this
+// insert will fail with a duplicate key. If the above transaction is aborted, this insert will
+// succeed.
+//
+// This insert will open up a hole in the oplog preventing writes from becoming majority
+// committed. In a properly behaving system, we will notice this resource being held while
+// entering a blocking call (prepare conflict resolution) and retry the transaction (which
+// releases the resource that prevents writes from becoming majority committed).
+const triggerPrepareConflictThread = new Thread(function(host, ns) {
+    const conn = new Mongo(host);
+    const collection = conn.getCollection(ns);
+    jsTestLog("Inserting a conflicting operation while keeping a hole open.");
+    assert.commandFailedWithCode(collection.insert([{a: 1}, {a: 2}, {a: 3}]),
+                                 ErrorCodes.DuplicateKey);
+}, primary.host, db[collName].getFullName());
+
+triggerPrepareConflictThread.start();
+
+// Wait for the insert to be in the system before attempting the majority write. Technically, this
+// is insufficient to prove we're properly exercising the code that detects a possible deadlock and
+// releases resources. In these cases, the test succeeds because the (yet to happen) majority write
+// occurs before the above thread creates a hole.
+assert.soon(() => {
+    const ops = primary.getDB("admin")
+                    .aggregate([
+                        {$currentOp: {allUsers: true}},
+                        {
+                            $match: {
+                                type: "op",
+                                ns: db[collName].getFullName(),
+                                "command.insert": {$exists: true},
+                            }
+                        }
+                    ])
+                    .toArray();
+
+    if (ops.length === 0) {
+        return false;
+    }
+
+    assert.eq(ops.length, 1, ops);
+    return true;
+});
+
+// If the system is misbehaving, this write will fail to "majority replicate". As noted above, in a
+// single node replica set, an operation must be visible in the oplog before it can be considered
+// majority replicated.
+jsTestLog("Doing the majority write.");
+assert.soon(() => {
+    assert.commandWorked(db.bla.insert({}, {writeConcern: {w: "majority"}}));
+    return true;
+});
+
+// We could stop the test here, but by committing the transaction we can also assert that the
+// `triggerPrepareConflictThread` sees a `DuplicateKey` error.
+jsTestLog({"Committing. CommitTs": prepTs});
+assert.commandWorked(db.adminCommand({
+    commitTransaction: 1,
+    lsid,
+    txnNumber: NumberLong(1),
+    autocommit: false,
+    commitTimestamp: prepTs
+}));
+
+triggerPrepareConflictThread.join();
+
+rst.stopSet();
+})();
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index ff76570aa51..f8e43160ebe 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -279,6 +279,14 @@ public:
     }
 
     /**
+     * Returns true if a commit timestamp has been assigned to writes in this transaction.
+     * Otherwise, returns false.
+     */
+    virtual bool isTimestamped() const {
+        return false;
+    }
+
+    /**
      * Sets a timestamp that will be assigned to all future writes on this RecoveryUnit until
      * clearCommitTimestamp() is called. This must be called either outside of a WUOW or on a
      * prepared transaction after setPrepareTimestamp() is called. setTimestamp() must not be called
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
index 4cb08f0aa03..a2603224e25 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp
@@ -33,11 +33,19 @@
 
 #include "mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h"
 
+#include <mutex>
+
+#include "mongo/util/fail_point.h"
 #include "mongo/util/fail_point_service.h"
 #include "mongo/util/log.h"
+#include "mongo/util/stacktrace.h"
 
 namespace mongo {
 
+namespace {
+std::once_flag logPrepareWithTimestampOnce;
+}
+
 // When set, simulates WT_PREPARE_CONFLICT returned from WiredTiger API calls.
 MONGO_FAIL_POINT_DEFINE(WTPrepareConflictForReads);
 
@@ -54,4 +62,11 @@ void wiredTigerPrepareConflictFailPointLog() {
     log() << "WTPrintPrepareConflictLog fail point enabled.";
 }
 
+void wiredTigerPrepareConflictOplogResourceLog() {
+    std::call_once(logPrepareWithTimestampOnce, [] {
+        log() << "Hit a prepare conflict while holding a resource on the oplog";
+        printStackTrace();
+    });
+}
+
 }  // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
index 9f57969cad7..3d3c361e98e 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h
@@ -29,6 +29,7 @@
 
 #pragma once
 
+#include "mongo/db/concurrency/write_conflict_exception.h"
 #include "mongo/db/curop.h"
 #include "mongo/db/prepare_conflict_tracker.h"
 #include "mongo/db/storage/wiredtiger/wiredtiger_record_store.h"
@@ -55,6 +56,8 @@ void wiredTigerPrepareConflictLog(int attempt);
  */
 void wiredTigerPrepareConflictFailPointLog();
 
+void wiredTigerPrepareConflictOplogResourceLog();
+
 /**
  * Runs the argument function f as many times as needed for f to return an error other than
  * WT_PREPARE_CONFLICT. Each time f returns WT_PREPARE_CONFLICT we wait until the current unit of
@@ -77,6 +80,17 @@ int wiredTigerPrepareConflictRetry(OperationContext* opCtx, F&& f) {
     if (ret != WT_PREPARE_CONFLICT)
         return ret;
 
+    if (opCtx->recoveryUnit()->isTimestamped()) {
+        // This transaction is holding a resource in the form of an oplog slot. Committed
+        // transactions that get a later oplog slot will be unable to replicate until this resource
+        // is released (in the form of this transaction committing or aborting). For this case, we
+        // choose to abort our transaction and retry instead of blocking. It's possible that we can
+        // be blocking on a prepared update that requires replication to make progress, creating a
+        // stall in the MDB cluster.
+        wiredTigerPrepareConflictOplogResourceLog();
+        throw WriteConflictException();
+    }
+
     PrepareConflictTracker::get(opCtx).beginPrepareConflict();
 
     auto client = opCtx->getClient();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 72f7d649729..6bbd13d079a 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -124,6 +124,10 @@ public:
 
     Status setTimestamp(Timestamp timestamp) override;
 
+    bool isTimestamped() const override {
+        return _isTimestamped;
+    }
+
     void setCommitTimestamp(Timestamp timestamp) override;
 
     void clearCommitTimestamp() override;