summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiyuan Zhou <siyuan.zhou@mongodb.com>2018-10-10 19:41:00 -0400
committerSiyuan Zhou <siyuan.zhou@mongodb.com>2019-03-05 14:57:55 -0500
commit38896cab6a0aad9bca1f9e9f5de65ea0cdc62d22 (patch)
tree667cc6fec5317dc88c007d69f51f9ec62446cd47
parent85f3a786bb5d33d74c88958214321143b4ea610e (diff)
downloadmongo-38896cab6a0aad9bca1f9e9f5de65ea0cdc62d22.tar.gz
SERVER-37179 Wait for specified write concern whenever commitTransaction returns a NoSuchTransaction error
(cherry picked from commit 6394bfafd5c42bfeb01b6686498d7fff697d9480)
-rw-r--r--jstests/libs/write_concern_util.js8
-rw-r--r--jstests/replsets/retryable_write_concern.js10
-rw-r--r--jstests/replsets/transient_txn_error_labels.js24
-rw-r--r--jstests/replsets/transient_txn_error_labels_with_write_concern.js119
-rw-r--r--src/mongo/db/service_entry_point_common.cpp44
5 files changed, 167 insertions, 38 deletions
diff --git a/jstests/libs/write_concern_util.js b/jstests/libs/write_concern_util.js
index 4bbf6ad4d29..6c8abc35bfd 100644
--- a/jstests/libs/write_concern_util.js
+++ b/jstests/libs/write_concern_util.js
@@ -99,3 +99,11 @@ function runCommandCheckAdmin(db, cmd) {
return db.runCommand(cmd.req);
}
}
+
+// Asserts that writeConcern timed out.
+function checkWriteConcernTimedOut(res) {
+ assertWriteConcernError(res);
+ const errInfo = res.writeConcernError.errInfo;
+ assert(errInfo, "No writeConcernError errInfo, got: " + tojson(res));
+ assert(errInfo.wtimeout, "No errInfo wtimeout, got: " + tojson(res));
+}
diff --git a/jstests/replsets/retryable_write_concern.js b/jstests/replsets/retryable_write_concern.js
index 719d5cda66c..60163db9685 100644
--- a/jstests/replsets/retryable_write_concern.js
+++ b/jstests/replsets/retryable_write_concern.js
@@ -17,16 +17,6 @@
const kNodes = 2;
- let checkWriteConcernTimedOut = function(result) {
- let wcErr = result.writeConcernError;
- assert.neq(null, wcErr, tojson(result));
-
- let errInfo = wcErr.errInfo;
- assert.neq(null, errInfo, tojson(result));
-
- assert.eq(true, errInfo.wtimeout, tojson(result));
- };
-
/**
* Tests that a retryable write properly waits for writeConcern on retry. Takes an optional
* 'setupFunc' that sets up the database state. 'setupFunc' accepts a connection to the
diff --git a/jstests/replsets/transient_txn_error_labels.js b/jstests/replsets/transient_txn_error_labels.js
index 5ca84ae6cb7..ad1420a167e 100644
--- a/jstests/replsets/transient_txn_error_labels.js
+++ b/jstests/replsets/transient_txn_error_labels.js
@@ -21,15 +21,17 @@
const testColl = testDB.getCollection(collName);
const sessionOptions = {causalConsistency: false};
- let session = secondary.startSession(sessionOptions);
+ let session = primary.startSession(sessionOptions);
let sessionDb = session.getDatabase(dbName);
let sessionColl = sessionDb.getCollection(collName);
+ let secondarySession = secondary.startSession(sessionOptions);
+ let secondarySessionDb = secondarySession.getDatabase(dbName);
assert.commandWorked(testDB.createCollection(collName, {writeConcern: {w: "majority"}}));
jsTest.log("Insert inside a transaction on secondary should fail but return error labels");
let txnNumber = 0;
- let res = sessionDb.runCommand({
+ let res = secondarySessionDb.runCommand({
insert: collName,
documents: [{_id: "insert-1"}],
readConcern: {level: "snapshot"},
@@ -43,26 +45,12 @@
jsTest.log("Insert outside a transaction on secondary should fail but not return error labels");
txnNumber++;
// Insert as a retryable write.
- res = sessionDb.runCommand(
+ res = secondarySessionDb.runCommand(
{insert: collName, documents: [{_id: "insert-1"}], txnNumber: NumberLong(txnNumber)});
assert.commandFailedWithCode(res, ErrorCodes.NotMaster);
assert(!res.hasOwnProperty("errorLabels"));
- session.endSession();
-
- jsTest.log("Write concern errors should not have error labels");
- // Start a new session on the primary.
- session = primary.startSession(sessionOptions);
- sessionDb = session.getDatabase(dbName);
- sessionColl = sessionDb.getCollection(collName);
- stopServerReplication(secondary);
- session.startTransaction({writeConcern: {w: "majority", wtimeout: 1}});
- assert.commandWorked(sessionColl.insert({_id: "write-with-write-concern"}));
- res = session.commitTransaction_forTesting();
- assert.eq(res.writeConcernError.code, ErrorCodes.WriteConcernFailed);
- assert(!res.hasOwnProperty("code"));
- assert(!res.hasOwnProperty("errorLabels"));
- restartServerReplication(secondary);
+ secondarySession.endSession();
jsTest.log("failCommand should be able to return errors with TransientTransactionError");
assert.commandWorked(testDB.adminCommand({
diff --git a/jstests/replsets/transient_txn_error_labels_with_write_concern.js b/jstests/replsets/transient_txn_error_labels_with_write_concern.js
new file mode 100644
index 00000000000..5c16667e7c4
--- /dev/null
+++ b/jstests/replsets/transient_txn_error_labels_with_write_concern.js
@@ -0,0 +1,119 @@
+// Test TransientTransactionError error label for commands in transactions with write concern.
+// @tags: [uses_transactions]
+(function() {
+ "use strict";
+
+ load("jstests/libs/write_concern_util.js");
+ load("jstests/replsets/rslib.js");
+
+ const dbName = "test";
+ const collName = "transient_txn_error_labels_with_write_concern";
+ const rst = new ReplSetTest({name: collName, nodes: 3});
+ const config = rst.getReplSetConfig();
+ config.members[2].priority = 0;
+ config.settings = {};
+ // Disable catchup so the new primary will not sync from the old one.
+ config.settings.catchUpTimeoutMillis = 0;
+ // Disable catchup takeover to prevent the old primary to take over the new one.
+ config.settings.catchUpTakeoverDelayMillis = -1;
+ rst.startSet();
+ rst.initiate(config);
+
+ const primary = rst.getPrimary();
+ const secondary = rst.getSecondary();
+ assert.eq(primary, rst.nodes[0]);
+ const testDB = primary.getDB(dbName);
+
+ const sessionOptions = {causalConsistency: false};
+ const writeConcernMajority = {w: "majority", wtimeout: 500};
+
+ assert.commandWorked(testDB.createCollection(collName, {writeConcern: {w: "majority"}}));
+
+ jsTest.log("Write concern errors should not have error labels");
+ // Start a new session on the primary.
+ let session = primary.startSession(sessionOptions);
+ let sessionDb = session.getDatabase(dbName);
+ let sessionColl = sessionDb.getCollection(collName);
+ stopServerReplication(rst.getSecondaries());
+ session.startTransaction({writeConcern: writeConcernMajority});
+ assert.commandWorked(sessionColl.insert({_id: "write-with-write-concern"}));
+ let res = session.commitTransaction_forTesting();
+ checkWriteConcernTimedOut(res);
+ assert(!res.hasOwnProperty("code"));
+ assert(!res.hasOwnProperty("errorLabels"));
+ restartServerReplication(rst.getSecondaries());
+
+ jsTest.log(
+ "commitTransaction should wait for write concern even if it returns NoSuchTransaction");
+ // Make sure the new primary only miss the commitTransaction sent in this test case.
+ rst.awaitReplication();
+
+ session.startTransaction(writeConcernMajority);
+ // Pick up a high enough txnNumber so that it doesn't conflict with previous test cases.
+ let txnNumber = 20;
+ assert.commandWorked(sessionDb.runCommand({
+ insert: collName,
+ documents: [{_id: "commitTransaction-with-write-concern"}],
+ readConcern: {level: "snapshot"},
+ txnNumber: NumberLong(txnNumber),
+ startTransaction: true,
+ autocommit: false
+ }));
+ const oldPrimary = rst.getPrimary();
+ // Stop replication on all nodes, including the old primary so that it won't replicate from the
+ // new primary.
+ stopServerReplication(rst.nodes);
+ // commitTransaction fails on the old primary.
+ res = sessionDb.adminCommand({
+ commitTransaction: 1,
+ txnNumber: NumberLong(txnNumber),
+ autocommit: false,
+ writeConcern: writeConcernMajority,
+ });
+ checkWriteConcernTimedOut(res);
+
+ // Failover happens, but the new secondary cannot replicate anything to others.
+ assert.commandWorked(secondary.adminCommand({replSetStepUp: 1}));
+ rst.awaitNodesAgreeOnPrimary();
+ reconnect(secondary);
+ // Restart replication on the new primary, so it can become "master".
+ restartServerReplication(secondary);
+ const newPrimary = rst.getPrimary();
+ assert.neq(oldPrimary, newPrimary);
+ const newPrimarySession = newPrimary.startSession(sessionOptions);
+ // Force the new session to use the old session id to simulate driver's behavior.
+ const overridenSessionId = newPrimarySession._serverSession.handle.getId();
+ const lsid = session._serverSession.handle.getId();
+ jsTest.log("Overriding sessionID " + tojson(overridenSessionId) + " with " + tojson(lsid));
+ newPrimarySession._serverSession.handle.getId = () => lsid;
+ const newPrimarySessionDb = newPrimarySession.getDatabase(dbName);
+
+ res = newPrimarySessionDb.adminCommand({
+ commitTransaction: 1,
+ txnNumber: NumberLong(txnNumber),
+ autocommit: false,
+ writeConcern: writeConcernMajority,
+ });
+ checkWriteConcernTimedOut(res);
+ assert.commandFailedWithCode(res, ErrorCodes.NoSuchTransaction);
+
+ jsTest.log("NoSuchTransaction with write concern error is not transient");
+ assert(!res.hasOwnProperty("errorLabels"));
+
+ jsTest.log("NoSuchTransaction without write concern error is transient");
+ restartServerReplication(rst.nodes);
+ res = newPrimarySessionDb.adminCommand({
+ commitTransaction: 1,
+ txnNumber: NumberLong(txnNumber),
+ autocommit: false,
+ writeConcern: {w: "majority"}, // Wait with a long timeout.
+ });
+ assert.commandFailedWithCode(res, ErrorCodes.NoSuchTransaction);
+ assert(!res.hasOwnProperty("writeConcernError"), res);
+ assert.eq(res["errorLabels"], ["TransientTransactionError"], res);
+
+ rst.awaitNodesAgreeOnPrimary();
+ session.endSession();
+
+ rst.stopSet();
+}());
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index c4180d864e6..ff715c7f72e 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -221,20 +221,30 @@ void generateErrorResponse(OperationContext* opCtx,
BSONObj getErrorLabels(const OperationSessionInfoFromClient& sessionOptions,
const std::string& commandName,
- ErrorCodes::Error code) {
+ ErrorCodes::Error code,
+ bool hasWriteConcernError) {
+
// By specifying "autocommit", the user indicates they want to run a transaction.
if (!sessionOptions.getAutocommit()) {
return {};
}
- bool isRetryable = ErrorCodes::isNotMasterError(code) || ErrorCodes::isShutdownError(code);
+ // The errors that indicate the transaction fails without any persistent side-effect.
bool isTransientTransactionError = code == ErrorCodes::WriteConflict //
|| code == ErrorCodes::SnapshotUnavailable //
- || code == ErrorCodes::NoSuchTransaction //
- || code == ErrorCodes::LockTimeout //
- // Clients can retry a single commitTransaction command, but cannot retry the whole
- // transaction if commitTransaction fails due to NotMaster.
- || (isRetryable && (commandName != "commitTransaction"));
+ || code == ErrorCodes::LockTimeout;
+
+ if (commandName == "commitTransaction") {
+ // NoSuchTransaction is determined based on the data. It's safe to retry the whole
+ // transaction, only if the data cannot be rolled back.
+ isTransientTransactionError |=
+ code == ErrorCodes::NoSuchTransaction && !hasWriteConcernError;
+ } else {
+ bool isRetryable = ErrorCodes::isNotMasterError(code) || ErrorCodes::isShutdownError(code);
+ // For commands other than "commitTransaction", we know there's no side-effect for these
+ // errors, but it's not true for "commitTransaction" if a failover happens.
+ isTransientTransactionError |= isRetryable || code == ErrorCodes::NoSuchTransaction;
+ }
if (isTransientTransactionError) {
return BSON("errorLabels" << BSON_ARRAY("TransientTransactionError"));
@@ -467,7 +477,7 @@ void invokeInTransaction(OperationContext* opCtx,
CommandInvocation* invocation,
const OpMsgRequest& request,
const OperationSessionInfoFromClient& sessionOptions,
- CommandReplyBuilder* replyBuilder) {
+ CommandReplyBuilder* replyBuilder) try {
auto session = OperationContextSession::get(opCtx);
invariant(session);
invariant(opCtx->getTxnNumber() || opCtx->getClient()->isInDirectClient());
@@ -495,6 +505,15 @@ void invokeInTransaction(OperationContext* opCtx,
// Stash or commit the transaction when the command succeeds.
session->stashTransactionResources(opCtx);
guard.Dismiss();
+} catch (const ExceptionFor<ErrorCodes::NoSuchTransaction>&) {
+ // We make our decision about the transaction state based on the oplog we have, so
+ // we set the client last op to the last optime observed by the system to ensure that
+ // we wait for the specified write concern on an optime greater than or equal to the
+ // the optime of our decision basis. Thus we know our decision basis won't be rolled
+ // back.
+ auto& replClient = repl::ReplClientInfo::forClient(opCtx->getClient());
+ replClient.setLastOpToSystemLastOpTime(opCtx);
+ throw;
}
bool runCommandImpl(OperationContext* opCtx,
@@ -590,7 +609,9 @@ bool runCommandImpl(OperationContext* opCtx,
if (codeField.isNumber()) {
auto code = ErrorCodes::Error(codeField.numberInt());
// Append the error labels for transient transaction errors.
- auto errorLabels = getErrorLabels(sessionOptions, command->getName(), code);
+ const auto hasWriteConcern = response.hasField("writeConcernError");
+ auto errorLabels =
+ getErrorLabels(sessionOptions, command->getName(), code, hasWriteConcern);
crb.getBodyBuilder().appendElements(errorLabels);
}
}
@@ -911,7 +932,10 @@ void execCommandDatabase(OperationContext* opCtx,
behaviors.handleException(e, opCtx);
// Append the error labels for transient transaction errors.
- auto errorLabels = getErrorLabels(sessionOptions, command->getName(), e.code());
+ auto response = extraFieldsBuilder.asTempObj();
+ auto hasWriteConcern = response.hasField("writeConcernError");
+ auto errorLabels =
+ getErrorLabels(sessionOptions, command->getName(), e.code(), hasWriteConcern);
extraFieldsBuilder.appendElements(errorLabels);
BSONObjBuilder metadataBob;