diff options
-rw-r--r-- | jstests/libs/write_concern_util.js | 8 | ||||
-rw-r--r-- | jstests/replsets/retryable_write_concern.js | 10 | ||||
-rw-r--r-- | jstests/replsets/transient_txn_error_labels.js | 24 | ||||
-rw-r--r-- | jstests/replsets/transient_txn_error_labels_with_write_concern.js | 119 | ||||
-rw-r--r-- | src/mongo/db/service_entry_point_common.cpp | 44 |
5 files changed, 167 insertions, 38 deletions
diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js index 4bbf6ad4d29..6c8abc35bfd 100644 --- a/jstests/libs/write_concern_util.js +++ b/jstests/libs/write_concern_util.js @@ -99,3 +99,11 @@ function runCommandCheckAdmin(db, cmd) { return db.runCommand(cmd.req); } } + +// Asserts that writeConcern timed out. +function checkWriteConcernTimedOut(res) { + assertWriteConcernError(res); + const errInfo = res.writeConcernError.errInfo; + assert(errInfo, "No writeConcernError errInfo, got: " + tojson(res)); + assert(errInfo.wtimeout, "No errInfo wtimeout, got: " + tojson(res)); +} diff --git a/jstests/replsets/retryable_write_concern.js b/jstests/replsets/retryable_write_concern.js index 719d5cda66c..60163db9685 100644 --- a/jstests/replsets/retryable_write_concern.js +++ b/jstests/replsets/retryable_write_concern.js @@ -17,16 +17,6 @@ const kNodes = 2; - let checkWriteConcernTimedOut = function(result) { - let wcErr = result.writeConcernError; - assert.neq(null, wcErr, tojson(result)); - - let errInfo = wcErr.errInfo; - assert.neq(null, errInfo, tojson(result)); - - assert.eq(true, errInfo.wtimeout, tojson(result)); - }; - /** * Tests that a retryable write properly waits for writeConcern on retry. Takes an optional * 'setupFunc' that sets up the database state. 'setupFunc' accepts a connection to the diff --git a/jstests/replsets/transient_txn_error_labels.js b/jstests/replsets/transient_txn_error_labels.js index 5ca84ae6cb7..ad1420a167e 100644 --- a/jstests/replsets/transient_txn_error_labels.js +++ b/jstests/replsets/transient_txn_error_labels.js @@ -21,15 +21,17 @@ const testColl = testDB.getCollection(collName); const sessionOptions = {causalConsistency: false}; - let session = secondary.startSession(sessionOptions); + let session = primary.startSession(sessionOptions); let sessionDb = session.getDatabase(dbName); let sessionColl = sessionDb.getCollection(collName); + let secondarySession = secondary.startSession(sessionOptions); + let secondarySessionDb = secondarySession.getDatabase(dbName); assert.commandWorked(testDB.createCollection(collName, {writeConcern: {w: "majority"}})); jsTest.log("Insert inside a transaction on secondary should fail but return error labels"); let txnNumber = 0; - let res = sessionDb.runCommand({ + let res = secondarySessionDb.runCommand({ insert: collName, documents: [{_id: "insert-1"}], readConcern: {level: "snapshot"}, @@ -43,26 +45,12 @@ jsTest.log("Insert outside a transaction on secondary should fail but not return error labels"); txnNumber++; // Insert as a retryable write. - res = sessionDb.runCommand( + res = secondarySessionDb.runCommand( {insert: collName, documents: [{_id: "insert-1"}], txnNumber: NumberLong(txnNumber)}); assert.commandFailedWithCode(res, ErrorCodes.NotMaster); assert(!res.hasOwnProperty("errorLabels")); - session.endSession(); - - jsTest.log("Write concern errors should not have error labels"); - // Start a new session on the primary. - session = primary.startSession(sessionOptions); - sessionDb = session.getDatabase(dbName); - sessionColl = sessionDb.getCollection(collName); - stopServerReplication(secondary); - session.startTransaction({writeConcern: {w: "majority", wtimeout: 1}}); - assert.commandWorked(sessionColl.insert({_id: "write-with-write-concern"})); - res = session.commitTransaction_forTesting(); - assert.eq(res.writeConcernError.code, ErrorCodes.WriteConcernFailed); - assert(!res.hasOwnProperty("code")); - assert(!res.hasOwnProperty("errorLabels")); - restartServerReplication(secondary); + secondarySession.endSession(); jsTest.log("failCommand should be able to return errors with TransientTransactionError"); assert.commandWorked(testDB.adminCommand({ diff --git a/jstests/replsets/transient_txn_error_labels_with_write_concern.js b/jstests/replsets/transient_txn_error_labels_with_write_concern.js new file mode 100644 index 00000000000..5c16667e7c4 --- /dev/null +++ b/jstests/replsets/transient_txn_error_labels_with_write_concern.js @@ -0,0 +1,119 @@ +// Test TransientTransactionError error label for commands in transactions with write concern. +// @tags: [uses_transactions] +(function() { + "use strict"; + + load("jstests/libs/write_concern_util.js"); + load("jstests/replsets/rslib.js"); + + const dbName = "test"; + const collName = "transient_txn_error_labels_with_write_concern"; + const rst = new ReplSetTest({name: collName, nodes: 3}); + const config = rst.getReplSetConfig(); + config.members[2].priority = 0; + config.settings = {}; + // Disable catchup so the new primary will not sync from the old one. + config.settings.catchUpTimeoutMillis = 0; + // Disable catchup takeover to prevent the old primary to take over the new one. + config.settings.catchUpTakeoverDelayMillis = -1; + rst.startSet(); + rst.initiate(config); + + const primary = rst.getPrimary(); + const secondary = rst.getSecondary(); + assert.eq(primary, rst.nodes[0]); + const testDB = primary.getDB(dbName); + + const sessionOptions = {causalConsistency: false}; + const writeConcernMajority = {w: "majority", wtimeout: 500}; + + assert.commandWorked(testDB.createCollection(collName, {writeConcern: {w: "majority"}})); + + jsTest.log("Write concern errors should not have error labels"); + // Start a new session on the primary. + let session = primary.startSession(sessionOptions); + let sessionDb = session.getDatabase(dbName); + let sessionColl = sessionDb.getCollection(collName); + stopServerReplication(rst.getSecondaries()); + session.startTransaction({writeConcern: writeConcernMajority}); + assert.commandWorked(sessionColl.insert({_id: "write-with-write-concern"})); + let res = session.commitTransaction_forTesting(); + checkWriteConcernTimedOut(res); + assert(!res.hasOwnProperty("code")); + assert(!res.hasOwnProperty("errorLabels")); + restartServerReplication(rst.getSecondaries()); + + jsTest.log( + "commitTransaction should wait for write concern even if it returns NoSuchTransaction"); + // Make sure the new primary only miss the commitTransaction sent in this test case. + rst.awaitReplication(); + + session.startTransaction(writeConcernMajority); + // Pick up a high enough txnNumber so that it doesn't conflict with previous test cases. + let txnNumber = 20; + assert.commandWorked(sessionDb.runCommand({ + insert: collName, + documents: [{_id: "commitTransaction-with-write-concern"}], + readConcern: {level: "snapshot"}, + txnNumber: NumberLong(txnNumber), + startTransaction: true, + autocommit: false + })); + const oldPrimary = rst.getPrimary(); + // Stop replication on all nodes, including the old primary so that it won't replicate from the + // new primary. + stopServerReplication(rst.nodes); + // commitTransaction fails on the old primary. + res = sessionDb.adminCommand({ + commitTransaction: 1, + txnNumber: NumberLong(txnNumber), + autocommit: false, + writeConcern: writeConcernMajority, + }); + checkWriteConcernTimedOut(res); + + // Failover happens, but the new secondary cannot replicate anything to others. + assert.commandWorked(secondary.adminCommand({replSetStepUp: 1})); + rst.awaitNodesAgreeOnPrimary(); + reconnect(secondary); + // Restart replication on the new primary, so it can become "master". + restartServerReplication(secondary); + const newPrimary = rst.getPrimary(); + assert.neq(oldPrimary, newPrimary); + const newPrimarySession = newPrimary.startSession(sessionOptions); + // Force the new session to use the old session id to simulate driver's behavior. + const overridenSessionId = newPrimarySession._serverSession.handle.getId(); + const lsid = session._serverSession.handle.getId(); + jsTest.log("Overriding sessionID " + tojson(overridenSessionId) + " with " + tojson(lsid)); + newPrimarySession._serverSession.handle.getId = () => lsid; + const newPrimarySessionDb = newPrimarySession.getDatabase(dbName); + + res = newPrimarySessionDb.adminCommand({ + commitTransaction: 1, + txnNumber: NumberLong(txnNumber), + autocommit: false, + writeConcern: writeConcernMajority, + }); + checkWriteConcernTimedOut(res); + assert.commandFailedWithCode(res, ErrorCodes.NoSuchTransaction); + + jsTest.log("NoSuchTransaction with write concern error is not transient"); + assert(!res.hasOwnProperty("errorLabels")); + + jsTest.log("NoSuchTransaction without write concern error is transient"); + restartServerReplication(rst.nodes); + res = newPrimarySessionDb.adminCommand({ + commitTransaction: 1, + txnNumber: NumberLong(txnNumber), + autocommit: false, + writeConcern: {w: "majority"}, // Wait with a long timeout. + }); + assert.commandFailedWithCode(res, ErrorCodes.NoSuchTransaction); + assert(!res.hasOwnProperty("writeConcernError"), res); + assert.eq(res["errorLabels"], ["TransientTransactionError"], res); + + rst.awaitNodesAgreeOnPrimary(); + session.endSession(); + + rst.stopSet(); +}()); diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index c4180d864e6..ff715c7f72e 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -221,20 +221,30 @@ void generateErrorResponse(OperationContext* opCtx, BSONObj getErrorLabels(const OperationSessionInfoFromClient& sessionOptions, const std::string& commandName, - ErrorCodes::Error code) { + ErrorCodes::Error code, + bool hasWriteConcernError) { + // By specifying "autocommit", the user indicates they want to run a transaction. if (!sessionOptions.getAutocommit()) { return {}; } - bool isRetryable = ErrorCodes::isNotMasterError(code) || ErrorCodes::isShutdownError(code); + // The errors that indicate the transaction fails without any persistent side-effect. bool isTransientTransactionError = code == ErrorCodes::WriteConflict // || code == ErrorCodes::SnapshotUnavailable // - || code == ErrorCodes::NoSuchTransaction // - || code == ErrorCodes::LockTimeout // - // Clients can retry a single commitTransaction command, but cannot retry the whole - // transaction if commitTransaction fails due to NotMaster. - || (isRetryable && (commandName != "commitTransaction")); + || code == ErrorCodes::LockTimeout; + + if (commandName == "commitTransaction") { + // NoSuchTransaction is determined based on the data. It's safe to retry the whole + // transaction, only if the data cannot be rolled back. + isTransientTransactionError |= + code == ErrorCodes::NoSuchTransaction && !hasWriteConcernError; + } else { + bool isRetryable = ErrorCodes::isNotMasterError(code) || ErrorCodes::isShutdownError(code); + // For commands other than "commitTransaction", we know there's no side-effect for these + // errors, but it's not true for "commitTransaction" if a failover happens. + isTransientTransactionError |= isRetryable || code == ErrorCodes::NoSuchTransaction; + } if (isTransientTransactionError) { return BSON("errorLabels" << BSON_ARRAY("TransientTransactionError")); @@ -467,7 +477,7 @@ void invokeInTransaction(OperationContext* opCtx, CommandInvocation* invocation, const OpMsgRequest& request, const OperationSessionInfoFromClient& sessionOptions, - CommandReplyBuilder* replyBuilder) { + CommandReplyBuilder* replyBuilder) try { auto session = OperationContextSession::get(opCtx); invariant(session); invariant(opCtx->getTxnNumber() || opCtx->getClient()->isInDirectClient()); @@ -495,6 +505,15 @@ void invokeInTransaction(OperationContext* opCtx, // Stash or commit the transaction when the command succeeds. session->stashTransactionResources(opCtx); guard.Dismiss(); +} catch (const ExceptionFor<ErrorCodes::NoSuchTransaction>&) { + // We make our decision about the transaction state based on the oplog we have, so + // we set the client last op to the last optime observed by the system to ensure that + // we wait for the specified write concern on an optime greater than or equal to the + // the optime of our decision basis. Thus we know our decision basis won't be rolled + // back. + auto& replClient = repl::ReplClientInfo::forClient(opCtx->getClient()); + replClient.setLastOpToSystemLastOpTime(opCtx); + throw; } bool runCommandImpl(OperationContext* opCtx, @@ -590,7 +609,9 @@ bool runCommandImpl(OperationContext* opCtx, if (codeField.isNumber()) { auto code = ErrorCodes::Error(codeField.numberInt()); // Append the error labels for transient transaction errors. - auto errorLabels = getErrorLabels(sessionOptions, command->getName(), code); + const auto hasWriteConcern = response.hasField("writeConcernError"); + auto errorLabels = + getErrorLabels(sessionOptions, command->getName(), code, hasWriteConcern); crb.getBodyBuilder().appendElements(errorLabels); } } @@ -911,7 +932,10 @@ void execCommandDatabase(OperationContext* opCtx, behaviors.handleException(e, opCtx); // Append the error labels for transient transaction errors. - auto errorLabels = getErrorLabels(sessionOptions, command->getName(), e.code()); + auto response = extraFieldsBuilder.asTempObj(); + auto hasWriteConcern = response.hasField("writeConcernError"); + auto errorLabels = + getErrorLabels(sessionOptions, command->getName(), e.code(), hasWriteConcern); extraFieldsBuilder.appendElements(errorLabels); BSONObjBuilder metadataBob; |