From 1e7e343fb6c90fbf0c62deabf61630353e2e5e29 Mon Sep 17 00:00:00 2001 From: Daniel Gottlieb Date: Thu, 10 Jun 2021 09:25:55 -0400 Subject: SERVER-57476: Return a WriteConflict when a timestamped transaction hits a prepare conflict. --- etc/backports_required_for_multiversion_tests.yml | 2 + .../assert_on_prepare_conflict_with_hole.js | 136 +++++++++++++++++++++ .../wiredtiger/wiredtiger_prepare_conflict.cpp | 14 +++ .../wiredtiger/wiredtiger_prepare_conflict.h | 14 +++ 4 files changed, 166 insertions(+) create mode 100644 jstests/replsets/assert_on_prepare_conflict_with_hole.js diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index bbe7db5487b..b57f0b2b92f 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -156,6 +156,8 @@ all: test_file: jstests/concurrency/fsm_workloads/findAndModify_flip_location.js - ticket: SERVER-56518 test_file: jstests/sharding/txn_writes_during_movechunk.js + - ticket: SERVER-57476 + test_file: jstests/replsets/assert_on_prepare_conflict_with_hole.js # Tests that should only be excluded from particular suites should be listed under that suite. suites: diff --git a/jstests/replsets/assert_on_prepare_conflict_with_hole.js b/jstests/replsets/assert_on_prepare_conflict_with_hole.js new file mode 100644 index 00000000000..b3379ff8b9d --- /dev/null +++ b/jstests/replsets/assert_on_prepare_conflict_with_hole.js @@ -0,0 +1,136 @@ +/** + * Constructs the following cycle that can lead to stalling a sharded cluster: + * | Preparer | Insert | OplogVisibility Ts | + * |---------------------------------------+---------------------------+--------------------| + * | BeginTxn | | | + * | Write A | | | + * | | BeginTxn | | + * | | Preallocates TS(10) | 9 | + * | (side txn commits prepare oplog @ 11) | | | + * | Prepare 11 | | | + * | | Write A (PrepareConflict) | | + * + * In this scenario, the prepared transaction blocks waiting for its prepare oplog entry at + * timestamp 11 to become majority committed. However, the prepare oplog entry cannot replicate to + * secondaries until the oplog visibility timestamp advances to 11. The oplog visibility timestamp + * advancing is blocked on the insert that allocated timestamps 10. The insert cannot make progress + * because it has hit a prepare conflict. The prepare conflict this test specifically exercises is + * for duplicate key detection on a non-_id unique index. + * + * @tags: [uses_transactions, uses_prepare_transaction] + */ +(function() { +"use strict"; + +load("jstests/libs/parallelTester.js"); + +// Use a single node replica set for simplicity. Note that an oplog hole on a single node replica +// will block new writes from becoming majority committed. +const rst = new ReplSetTest({ + nodes: 1, + nodeOptions: { + setParameter: {logComponentVerbosity: tojson({storage: 1})}, + } +}); +rst.startSet(); +rst.initiate(); + +const primary = rst.getPrimary(); +assert.commandWorked(primary.adminCommand( + {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}})); +const db = primary.getDB("test"); + +const collName = "mycoll"; +assert.commandWorked(db.runCommand({create: collName, writeConcern: {w: "majority"}})); +// A secondary unique index requires cursor positioning in WT which can result in hitting a prepare +// conflict. +assert.commandWorked(db[collName].createIndex({a: 1}, {unique: true})); + +// Start a multi-document transaction that inserts an `a: 2` update. +const lsid = ({id: UUID()}); +assert.commandWorked(db.runCommand({ + insert: collName, + documents: [{a: 2}], + lsid, + txnNumber: NumberLong(1), + autocommit: false, + startTransaction: true, +})); + +// Prepare the `a: 2` update. +let prepTs = assert.commandWorked(db.adminCommand({ + prepareTransaction: 1, + lsid, + txnNumber: NumberLong(1), + autocommit: false +}))["prepareTimestamp"]; + +// In another thread, perform an insert that also attempts to touch the `a: 2` update. This insert +// will block until the above transaction commits or aborts. If the above transaction commits, this +// insert will fail with a duplicate key. If the above transaction is aborted, this insert will +// succeed. +// +// This insert will open up a hole in the oplog preventing writes from becoming majority +// committed. In a properly behaving system, we will notice this resource being held while +// entering a blocking call (prepare conflict resolution) and retry the transaction (which +// releases the resource that prevents writes from becoming majority committed). +const triggerPrepareConflictThread = new Thread(function(host, ns) { + const conn = new Mongo(host); + const collection = conn.getCollection(ns); + jsTestLog("Inserting a conflicting operation while keeping a hole open."); + assert.commandFailedWithCode(collection.insert([{a: 1}, {a: 2}, {a: 3}]), + ErrorCodes.DuplicateKey); +}, primary.host, db[collName].getFullName()); + +triggerPrepareConflictThread.start(); + +// Wait for the insert to be in the system before attempting the majority write. Technically, this +// is insufficient to prove we're properly exercising the code that detects a possible deadlock and +// releases resources. In these cases, the test succeeds because the (yet to happen) majority write +// occurs before the above thread creates a hole. +assert.soon(() => { + const ops = primary.getDB("admin") + .aggregate([ + {$currentOp: {allUsers: true}}, + { + $match: { + type: "op", + ns: db[collName].getFullName(), + "command.insert": {$exists: true}, + } + } + ]) + .toArray(); + + if (ops.length === 0) { + return false; + } + + assert.eq(ops.length, 1, ops); + return true; +}); + +// If the system is misbehaving, this write will fail to "majority replicate". As noted above, in a +// single node replica set, an operation must be visible in the oplog before it can be considered +// majority replicated. +jsTestLog("Doing the majority write."); +assert.soon(() => { + assert.commandWorked(db.bla.insert({}, {writeConcern: {w: "majority"}})); + return true; +}); + +// We could stop the test here, but by committing the transaction we can also assert that the +// `triggerPrepareConflictThread` sees a `DuplicateKey` error. +jsTestLog({"Committing. CommitTs": prepTs}); +assert.commandWorked(db.adminCommand({ + commitTransaction: 1, + lsid, + txnNumber: NumberLong(1), + autocommit: false, + commitTimestamp: prepTs +})); + +triggerPrepareConflictThread.join(); + +rst.stopSet(); +})(); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp index b4d49c5e5d7..2fb1ccbdc0e 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.cpp @@ -33,11 +33,18 @@ #include "mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h" +#include + #include "mongo/logv2/log.h" #include "mongo/util/fail_point.h" +#include "mongo/util/stacktrace.h" namespace mongo { +namespace { +std::once_flag logPrepareWithTimestampOnce; +} + // When set, simulates WT_PREPARE_CONFLICT returned from WiredTiger API calls. MONGO_FAIL_POINT_DEFINE(WTPrepareConflictForReads); @@ -57,4 +64,11 @@ void wiredTigerPrepareConflictFailPointLog() { LOGV2(22380, "WTPrintPrepareConflictLog fail point enabled."); } +void wiredTigerPrepareConflictOplogResourceLog() { + std::call_once(logPrepareWithTimestampOnce, [] { + LOGV2(5739901, "Hit a prepare conflict while holding a resource on the oplog"); + printStackTrace(); + }); +} + } // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h index c297332913c..28cc3304412 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_prepare_conflict.h @@ -29,6 +29,7 @@ #pragma once +#include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/curop.h" #include "mongo/db/prepare_conflict_tracker.h" #include "mongo/db/storage/wiredtiger/wiredtiger_record_store.h" @@ -55,6 +56,8 @@ void wiredTigerPrepareConflictLog(int attempt); */ void wiredTigerPrepareConflictFailPointLog(); +void wiredTigerPrepareConflictOplogResourceLog(); + /** * Runs the argument function f as many times as needed for f to return an error other than * WT_PREPARE_CONFLICT. Each time f returns WT_PREPARE_CONFLICT we wait until the current unit of @@ -72,6 +75,17 @@ int wiredTigerPrepareConflictRetry(OperationContext* opCtx, F&& f) { if (ret != WT_PREPARE_CONFLICT) return ret; + if (opCtx->recoveryUnit()->isTimestamped()) { + // This transaction is holding a resource in the form of an oplog slot. Committed + // transactions that get a later oplog slot will be unable to replicate until this resource + // is released (in the form of this transaction committing or aborting). For this case, we + // choose to abort our transaction and retry instead of blocking. It's possible that we can + // be blocking on a prepared update that requires replication to make progress, creating a + // stall in the MDB cluster. + wiredTigerPrepareConflictOplogResourceLog(); + throw WriteConflictException(); + } + auto recoveryUnit = WiredTigerRecoveryUnit::get(opCtx); int attempts = 1; // If we return from this function, we have either returned successfully or we've returned an -- cgit v1.2.1