summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBilly Donahue <billy.donahue@mongodb.com>2021-05-10 18:01:22 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-06-08 21:39:25 +0000
commit357923ed77b477539e65d0b8a4ec5cae0e11746a (patch)
tree5365dfb853c5df2a99193148bcc025ff91105030
parente44dd9b6b1cd1636406f968fdd4f5bf71e5aa0c3 (diff)
downloadmongo-357923ed77b477539e65d0b8a4ec5cae0e11746a.tar.gz
SERVER-50549 transform state-changing errors returned by mongos
(cherry picked from commit 4a311668ac834b6e363a44c9f00cad9d4790288f)
-rw-r--r--jstests/sharding/change_stream_error_label.js7
-rw-r--r--jstests/sharding/linearizable_read_concern.js2
-rw-r--r--jstests/sharding/mongos_not_retry_commands_in_transactions.js19
-rw-r--r--jstests/sharding/retryable_write_error_labels.js9
-rw-r--r--jstests/sharding/transactions_writes_not_retryable.js2
-rw-r--r--src/mongo/db/SConscript1
-rw-r--r--src/mongo/db/commands.cpp8
-rw-r--r--src/mongo/db/commands/SConscript1
-rw-r--r--src/mongo/db/commands/getmore_cmd.cpp8
-rw-r--r--src/mongo/rpc/SConscript19
-rw-r--r--src/mongo/rpc/rewrite_state_change_errors.cpp235
-rw-r--r--src/mongo/rpc/rewrite_state_change_errors.h95
-rw-r--r--src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl40
-rw-r--r--src/mongo/rpc/rewrite_state_change_errors_test.cpp181
-rw-r--r--src/mongo/s/commands/SConscript1
-rw-r--r--src/mongo/s/commands/cluster_hello_cmd.cpp4
-rw-r--r--src/mongo/s/commands/strategy.cpp6
-rw-r--r--src/mongo/shell/error_codes.tpl.js35
18 files changed, 658 insertions, 15 deletions
diff --git a/jstests/sharding/change_stream_error_label.js b/jstests/sharding/change_stream_error_label.js
index c393b6254bc..c62b35743bd 100644
--- a/jstests/sharding/change_stream_error_label.js
+++ b/jstests/sharding/change_stream_error_label.js
@@ -41,7 +41,8 @@ const expectedStopShardErrors = [
// First, verify that the 'failGetMoreAfterCursorCheckout' failpoint can effectively exercise the
// error label generation logic for change stream getMores.
-function testFailGetMoreAfterCursorCheckoutFailpoint({errorCode, expectedLabel}) {
+function testFailGetMoreAfterCursorCheckoutFailpoint({mongos, errorCode, expectedLabel}) {
+ errorCode = ErrorCodes.doMongosRewrite(mongos, errorCode);
// Activate the failpoint and set the exception that it will throw.
assert.commandWorked(testDB.adminCommand({
configureFailPoint: "failGetMoreAfterCursorCheckout",
@@ -69,9 +70,9 @@ function testFailGetMoreAfterCursorCheckoutFailpoint({errorCode, expectedLabel})
}
// Test the expected output for both resumable and non-resumable error codes.
testFailGetMoreAfterCursorCheckoutFailpoint(
- {errorCode: ErrorCodes.ShutdownInProgress, expectedLabel: true});
+ {mongos: st.s, errorCode: ErrorCodes.ShutdownInProgress, expectedLabel: true});
testFailGetMoreAfterCursorCheckoutFailpoint(
- {errorCode: ErrorCodes.FailedToParse, expectedLabel: false});
+ {mongos: st.s, errorCode: ErrorCodes.FailedToParse, expectedLabel: false});
// Now test both aggregate and getMore under conditions of an actual cluster outage. Shard the
// collection on shard0, split at {_id: 0}, and move the upper chunk to the other shard.
diff --git a/jstests/sharding/linearizable_read_concern.js b/jstests/sharding/linearizable_read_concern.js
index 0ac630639a4..a5720681aeb 100644
--- a/jstests/sharding/linearizable_read_concern.js
+++ b/jstests/sharding/linearizable_read_concern.js
@@ -85,7 +85,7 @@ var res = assert.commandFailed(testDB.runReadCommand({
readConcern: {level: "linearizable"},
maxTimeMS: shard0ReplTest.kDefaultTimeoutMS
}));
-assert.eq(res.code, ErrorCodes.NotWritablePrimary);
+assert.eq(res.code, ErrorCodes.doMongosRewrite(st.s, ErrorCodes.NotWritablePrimary));
jsTestLog("Testing linearizable read from primaries.");
diff --git a/jstests/sharding/mongos_not_retry_commands_in_transactions.js b/jstests/sharding/mongos_not_retry_commands_in_transactions.js
index 60e74860f5e..98a7f08f7ad 100644
--- a/jstests/sharding/mongos_not_retry_commands_in_transactions.js
+++ b/jstests/sharding/mongos_not_retry_commands_in_transactions.js
@@ -54,15 +54,16 @@ jsTest.log(
"Testing that mongos doesn't retry the read command with startTransaction=true on replication set failover.");
assert.commandWorked(setCommandToFail(primaryConnection, "find", kNs));
-assert.commandFailedWithCode(mongosDB.runCommand({
- find: kCollName,
- filter: kDoc0,
- startTransaction: true,
- txnNumber: NumberLong(transactionNumber++),
- stmtId: NumberInt(0),
- autocommit: false
-}),
- ErrorCodes.InterruptedDueToReplStateChange);
+assert.commandFailedWithCode(
+ mongosDB.runCommand({
+ find: kCollName,
+ filter: kDoc0,
+ startTransaction: true,
+ txnNumber: NumberLong(transactionNumber++),
+ stmtId: NumberInt(0),
+ autocommit: false
+ }),
+ ErrorCodes.doMongosRewrite(st.s0, ErrorCodes.InterruptedDueToReplStateChange));
jsTest.log("Testing that mongos retries retryable writes on failover.");
assert.commandWorked(setCommandToFail(primaryConnection, "insert", kNs));
diff --git a/jstests/sharding/retryable_write_error_labels.js b/jstests/sharding/retryable_write_error_labels.js
index 9f8d8a8be73..37bb0dbdf31 100644
--- a/jstests/sharding/retryable_write_error_labels.js
+++ b/jstests/sharding/retryable_write_error_labels.js
@@ -29,6 +29,15 @@ const st = new ShardingTest({
});
function checkErrorCode(res, expectedErrorCodes, isWCError) {
+ // Rewrite each element of the `expectedErrorCodes` array.
+ // If it's not an array, just rewrite the scalar.
+ var rewrite = ec => ErrorCodes.doMongosRewrite(st.s, ec);
+ if (Array.isArray(expectedErrorCodes)) {
+ expectedErrorCodes = expectedErrorCodes.map(rewrite);
+ } else {
+ expectedErrorCodes = rewrite(expectedErrorCodes);
+ }
+
if (isWCError) {
assert.neq(null, res.writeConcernError, res);
assert(anyEq([res.writeConcernError.code], expectedErrorCodes), res);
diff --git a/jstests/sharding/transactions_writes_not_retryable.js b/jstests/sharding/transactions_writes_not_retryable.js
index 462476c4fb7..7be7dd2b163 100644
--- a/jstests/sharding/transactions_writes_not_retryable.js
+++ b/jstests/sharding/transactions_writes_not_retryable.js
@@ -31,7 +31,7 @@ function runTest(st, session, sessionDB, writeCmdName, writeCmd, isSharded) {
session.startTransaction();
assert.commandFailedWithCode(
sessionDB.runCommand(writeCmd),
- retryableError,
+ ErrorCodes.doMongosRewrite(st.s, retryableError),
"expected write in transaction not to be retried on retryable error, cmd: " +
tojson(writeCmd) + ", sharded: " + isSharded);
assert.commandFailedWithCode(session.abortTransaction_forTesting(),
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 7f1253a8327..7db1148890f 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -601,6 +601,7 @@ env.Library(
'$BUILD_DIR/mongo/db/query/explain_options',
'$BUILD_DIR/mongo/idl/idl_parser',
'$BUILD_DIR/mongo/rpc/command_status',
+ '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors',
'$BUILD_DIR/mongo/rpc/rpc',
'audit',
'auth/auth',
diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp
index 7f7c5f39155..ed822c5bb81 100644
--- a/src/mongo/db/commands.cpp
+++ b/src/mongo/db/commands.cpp
@@ -61,6 +61,7 @@
#include "mongo/rpc/metadata/client_metadata.h"
#include "mongo/rpc/op_msg_rpc_impls.h"
#include "mongo/rpc/protocol.h"
+#include "mongo/rpc/rewrite_state_change_errors.h"
#include "mongo/rpc/write_concern_error_detail.h"
#include "mongo/s/stale_exception.h"
#include "mongo/util/fail_point.h"
@@ -681,6 +682,13 @@ void CommandHelpers::evaluateFailCommandFailPoint(OperationContext* opCtx,
const Command* cmd = invocation->definition();
failCommand.executeIf(
[&](const BSONObj& data) {
+ // State change codes are rewritten on the way out of a `mongos`
+ // server. Errors injected via failpoint manipulation are normally
+ // exempt from this. However, we provide an override option so they
+ // can be made subject to rewriting if that's really necessary.
+ if (bool b; !bsonExtractBooleanField(data, "allowRewriteStateChange", &b).isOK() || !b)
+ rpc::RewriteStateChangeErrors::setEnabled(opCtx, false);
+
if (data.hasField(kErrorLabelsFieldName) &&
data[kErrorLabelsFieldName].type() == Array) {
// Propagate error labels specified in the failCommand failpoint to the
diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript
index 0b2f570ac1d..39a49adaab8 100644
--- a/src/mongo/db/commands/SConscript
+++ b/src/mongo/db/commands/SConscript
@@ -359,6 +359,7 @@ env.Library(
'$BUILD_DIR/mongo/db/views/views_mongod',
'$BUILD_DIR/mongo/executor/async_request_executor',
'$BUILD_DIR/mongo/idl/feature_flag',
+ '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors',
'$BUILD_DIR/mongo/util/log_and_backoff',
'$BUILD_DIR/mongo/util/net/http_client',
'core',
diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp
index 367182569f1..25c5162e69d 100644
--- a/src/mongo/db/commands/getmore_cmd.cpp
+++ b/src/mongo/db/commands/getmore_cmd.cpp
@@ -36,6 +36,7 @@
#include <memory>
#include <string>
+#include "mongo/bson/util/bson_extract.h"
#include "mongo/db/api_parameters.h"
#include "mongo/db/auth/authorization_checks.h"
#include "mongo/db/auth/authorization_session.h"
@@ -64,6 +65,7 @@
#include "mongo/db/stats/resource_consumption_metrics.h"
#include "mongo/db/stats/top.h"
#include "mongo/logv2/log.h"
+#include "mongo/rpc/rewrite_state_change_errors.h"
#include "mongo/s/chunk_version.h"
#include "mongo/util/fail_point.h"
#include "mongo/util/scopeguard.h"
@@ -569,7 +571,11 @@ public:
// If the 'failGetMoreAfterCursorCheckout' failpoint is enabled, throw an exception with
// the given 'errorCode' value, or ErrorCodes::InternalError if 'errorCode' is omitted.
failGetMoreAfterCursorCheckout.executeIf(
- [](const BSONObj& data) {
+ [&](const BSONObj& data) {
+ if (bool b;
+ !bsonExtractBooleanField(data, "allowRewriteStateChange", &b).isOK() || !b)
+ rpc::RewriteStateChangeErrors::setEnabled(opCtx, false);
+
auto errorCode = (data["errorCode"] ? data["errorCode"].safeNumberLong()
: ErrorCodes::InternalError);
uasserted(errorCode, "Hit the 'failGetMoreAfterCursorCheckout' failpoint");
diff --git a/src/mongo/rpc/SConscript b/src/mongo/rpc/SConscript
index c16d1e116c4..dd7d6205c84 100644
--- a/src/mongo/rpc/SConscript
+++ b/src/mongo/rpc/SConscript
@@ -74,6 +74,23 @@ env.Library(
],
LIBDEPS_PRIVATE=[
'$BUILD_DIR/mongo/idl/server_parameter',
+ 'rewrite_state_change_errors',
+ ],
+)
+
+env.Library(
+ target='rewrite_state_change_errors',
+ source=[
+ 'rewrite_state_change_errors.cpp',
+ 'rewrite_state_change_errors_server_parameter.idl',
+ ],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/base',
+ '$BUILD_DIR/mongo/bson/mutable/mutable_bson',
+ '$BUILD_DIR/mongo/db/service_context',
+ '$BUILD_DIR/mongo/rpc/protocol',
+ '$BUILD_DIR/mongo/s/is_mongos',
+ '$BUILD_DIR/third_party/shim_pcrecpp',
],
)
@@ -170,12 +187,14 @@ if wiredtiger:
'op_msg_test.cpp',
'protocol_test.cpp',
'reply_builder_test.cpp',
+ 'rewrite_state_change_errors_test.cpp',
],
LIBDEPS=[
'$BUILD_DIR/mongo/client/clientdriver_minimal',
'$BUILD_DIR/third_party/wiredtiger/wiredtiger_checksum',
'client_metadata',
'metadata',
+ 'rewrite_state_change_errors',
'rpc',
]
)
diff --git a/src/mongo/rpc/rewrite_state_change_errors.cpp b/src/mongo/rpc/rewrite_state_change_errors.cpp
new file mode 100644
index 00000000000..9f0fea0c294
--- /dev/null
+++ b/src/mongo/rpc/rewrite_state_change_errors.cpp
@@ -0,0 +1,235 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kNetwork
+
+#include "mongo/rpc/rewrite_state_change_errors.h"
+
+#include "mongo/platform/basic.h"
+
+#include <array>
+#include <string>
+
+#include <boost/optional.hpp>
+#include <fmt/format.h>
+#include <pcrecpp.h>
+
+#include "mongo/bson/mutable/document.h"
+#include "mongo/bson/mutable/element.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/service_context.h"
+#include "mongo/logv2/log.h"
+#include "mongo/platform/atomic_word.h"
+#include "mongo/rpc/message.h"
+#include "mongo/rpc/op_msg.h"
+#include "mongo/rpc/rewrite_state_change_errors_server_parameter_gen.h"
+#include "mongo/s/is_mongos.h"
+#include "mongo/util/assert_util.h"
+#include "mongo/util/static_immortal.h"
+
+namespace mongo::rpc {
+namespace {
+
+struct RewriteEnabled {
+ bool enabled = rewriteStateChangeErrors;
+};
+
+/** Enable for the entire service. */
+auto enabledForService = ServiceContext::declareDecoration<RewriteEnabled>();
+
+/** Enable for a single operation. */
+auto enabledForOperation = OperationContext::declareDecoration<RewriteEnabled>();
+
+
+/**
+ * We must replace key phrases in `errmsg` with sensible strings that
+ * drivers will ignore. Return scrubbed `val`, or no value if it
+ * doesn't need scrubbing.
+ *
+ * See
+ * https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#not-master-and-node-is-recovering
+ */
+boost::optional<std::string> scrubErrmsg(StringData val) {
+ struct Scrub {
+ pcrecpp::RE pat;
+ std::string sub;
+ };
+ static const StaticImmortal scrubs = std::array{
+ Scrub{pcrecpp::RE("not master"), "(NOT_PRIMARY)"},
+ Scrub{pcrecpp::RE("node is recovering"), "(NODE_IS_RECOVERING)"},
+ };
+ // Fast scan for the common case that no key phrase is present.
+ static const StaticImmortal fastScan = [] {
+ std::string pat;
+ StringData sep;
+ auto out = std::back_inserter(pat);
+ for (const auto& scrub : *scrubs) {
+ out = format_to(out, FMT_STRING("{}({})"), sep, scrub.pat.pattern());
+ sep = "|"_sd;
+ }
+ return pcrecpp::RE(pat);
+ }();
+
+ pcrecpp::StringPiece pcreVal(val.rawData(), val.size());
+
+ if (fastScan->PartialMatch(pcreVal)) {
+ std::string s{val};
+ bool didSub = false;
+ for (auto&& scrub : *scrubs) {
+ bool subOk = scrub.pat.GlobalReplace(scrub.sub, &s);
+ didSub = (didSub || subOk);
+ }
+ if (didSub)
+ return s;
+ }
+ return {};
+}
+
+/**
+ * Change the {code, codeName, errmsg} fields to cloak a proxied state change
+ * error. We choose `HostUnreachable` as it is retryable but doesn't induce an
+ * SDAM state change.
+ */
+void editErrorNode(mutablebson::Element&& node) {
+ if (auto codeElement = node["code"]; codeElement.ok()) {
+ static constexpr auto newCode = ErrorCodes::HostUnreachable;
+ uassertStatusOK(codeElement.setValueInt(newCode));
+ // If there's a corresponding `codeName`, replace it to match.
+ if (auto codeName = node["codeName"]; codeName.ok())
+ uassertStatusOK(codeName.setValueString(ErrorCodes::errorString(newCode)));
+ }
+ if (auto errmsg = node["errmsg"]; errmsg.ok() && errmsg.isType(String))
+ if (auto scrubbed = scrubErrmsg(errmsg.getValueString()))
+ uassertStatusOK(errmsg.setValueString(*scrubbed));
+}
+
+/**
+ * If `node` contains a numeric "code" field that indicates a state change,
+ * returns the value of that code field as an `ErrorCodes::Error`. Otherwise
+ * returns a disengaged optional.
+ *
+ * There are two categories of state change errors: ShutdownError and
+ * NotPrimaryError.
+ */
+boost::optional<ErrorCodes::Error> needsRewrite(ServiceContext* sc, const BSONObj& node) {
+ int32_t intCode{};
+ if (!node["code"].coerce(&intCode))
+ return {};
+ ErrorCodes::Error ec{intCode};
+
+ if (ErrorCodes::isA<ErrorCategory::NotPrimaryError>(ec)) {
+ return ec;
+ }
+
+ // ShutdownError codes are correct if this server is also in shutdown. If
+ // this server is shutting down, then even if the shutdown error didn't
+ // originate from this server, it might as well have.
+ if (ErrorCodes::isA<ErrorCategory::ShutdownError>(ec) && !sc->getKillAllOperations()) {
+ return ec;
+ }
+
+ return {};
+}
+
+/**
+ * Returns a copy of doc with errors rewritten to cloak state change errors if
+ * necessary. Returns disengaged optional if no changes were necessary.
+ */
+boost::optional<BSONObj> rewriteDocument(const BSONObj& doc, OperationContext* opCtx) {
+ boost::optional<mutablebson::Document> mutableDoc;
+ auto lazyMutableRoot = [&] {
+ if (!mutableDoc)
+ mutableDoc.emplace(doc);
+ return mutableDoc->root();
+ };
+
+ ServiceContext* sc = opCtx->getServiceContext();
+
+ // Skip unless there's an "ok" element value equivalent to expected 0 or 1.
+ // "ok" is conventionally a NumberDouble, but coerce since this is unspecified.
+ double okValue = 0;
+ if (!doc.getField("ok").coerce(&okValue) || (okValue != 0 && okValue != 1))
+ return {}; // Skip: missing or unusable "ok" field.
+
+ boost::optional<ErrorCodes::Error> oldCode;
+
+ // The root of the doc is an error-bearing node if not ok.
+ if (okValue == 0 && (oldCode = needsRewrite(sc, doc)))
+ editErrorNode(lazyMutableRoot());
+
+ // The `writeErrors` and `writeConcernError` nodes might need editing.
+ // `writeErrors` is an array of error-bearing nodes like the doc root.
+ if (const auto& we = doc["writeErrors"]; we.type() == Array) {
+ size_t idx = 0;
+ BSONObj bArr = we.Obj();
+ for (auto ai = bArr.begin(); ai != bArr.end(); ++ai, ++idx)
+ if (ai->type() == Object && (oldCode = needsRewrite(sc, ai->Obj())))
+ editErrorNode(lazyMutableRoot()["writeErrors"][idx]);
+ }
+
+ // `writeConcernError` is a single error-bearing node.
+ if (const auto& wce = doc["writeConcernError"]; wce.type() == Object) {
+ if ((oldCode = needsRewrite(sc, wce.Obj())))
+ editErrorNode(lazyMutableRoot()["writeConcernError"]);
+ }
+
+ if (mutableDoc) {
+ LOGV2_DEBUG(1, 5054900, "Rewrote state change error", "code"_attr = oldCode);
+ return mutableDoc->getObject();
+ }
+ return {};
+}
+
+} // namespace
+
+
+bool RewriteStateChangeErrors::getEnabled(OperationContext* opCtx) {
+ return enabledForOperation(opCtx).enabled;
+}
+
+void RewriteStateChangeErrors::setEnabled(OperationContext* opCtx, bool e) {
+ enabledForOperation(opCtx).enabled = e;
+}
+
+bool RewriteStateChangeErrors::getEnabled(ServiceContext* sc) {
+ return enabledForService(sc).enabled;
+}
+
+void RewriteStateChangeErrors::setEnabled(ServiceContext* sc, bool e) {
+ enabledForService(sc).enabled = e;
+}
+
+boost::optional<BSONObj> RewriteStateChangeErrors::rewrite(BSONObj doc, OperationContext* opCtx) {
+ auto sc = opCtx->getServiceContext();
+ if (!isMongos() || (sc && !getEnabled(sc)) || !getEnabled(opCtx))
+ return {};
+ return rewriteDocument(doc, opCtx);
+}
+
+} // namespace mongo::rpc
diff --git a/src/mongo/rpc/rewrite_state_change_errors.h b/src/mongo/rpc/rewrite_state_change_errors.h
new file mode 100644
index 00000000000..9edccd7e999
--- /dev/null
+++ b/src/mongo/rpc/rewrite_state_change_errors.h
@@ -0,0 +1,95 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <boost/optional.hpp>
+
+#include "mongo/db/operation_context.h"
+#include "mongo/db/service_context.h"
+#include "mongo/rpc/message.h"
+
+namespace mongo::rpc {
+
+class RewriteStateChangeErrors {
+public:
+ /**
+ * Enable/disable for an entire server.
+ * The default is determined by server parameter "rewriteStateChangeErrors".
+ */
+ static bool getEnabled(ServiceContext* sc);
+ static void setEnabled(ServiceContext* sc, bool e);
+
+ /** Enable/disable only for a single operation. */
+ static bool getEnabled(OperationContext* opCtx);
+ static void setEnabled(OperationContext* opCtx, bool e);
+
+ /**
+ * Transforms an outgoing message to conditionally mask "state change" errors,
+ * which are errors that cause a client to change its connection to the host
+ * that sent it, such as marking it "Unknown". A shutdown error that's emitted
+ * by a proxy server (e.g. mongos) can be misinterpreted by a naive client to
+ * be indicative of the proxy server shutting down. So this simple rewrite
+ * scheme attempts to mask those errors on the way out.
+ *
+ * Return a disengaged optional if no rewrite is needed.
+ * Otherwise, returns a copy of doc with the errors rewritten.
+ *
+ * Error-bearing subobjects are places in the document where a `code` element
+ * can indicate an error status. When rewriting, any error-bearing subobjects
+ * are examined for a `code` element.
+ *
+ * In an error document (with "ok" mapped to 0.0), only the root element is
+ * error-bearing. In a success document (where "ok" is mapped to 1.0), the
+ * `writeConcernError` subobject and all the subobjects in the `writeErrors`
+ * BSON array are error-bearing subobjects.
+ *
+ * Rewriting occurs only if all of these conditions are met:
+ *
+ * - This feature wasn't deselected with a `setEnabled` call, either on the
+ * `opCtx` or for more widely for its associated ServiceContext.
+ * This will happen with failpoint-induced errors or with "hello" commands,
+ * both of which are exempt from state change error rewriting.
+ *
+ * - doc contains no error nodes that need rewriting.
+ * - NotPrimaryError codes always need rewriting.
+ * - ShutdownError codes need rewriting unless this server is
+ * shutting down.
+ *
+ * Any rewritten `code` is replaced by `HostUnreachable`, and associated
+ * `codeName` replaced to be consistent with the new code. Additionally, the
+ * `errmsg` in the subobject has all occurrences of the key phrases "not
+ * master" and "node is recovering" replaced with "(NOT_PRIMARY)" and
+ * "(NODE_IS_RECOVERING)" respectively so that client state changes based on
+ * the presence of these legacy strings are suppressed.
+ */
+ static boost::optional<BSONObj> rewrite(BSONObj doc, OperationContext* opCtx);
+};
+
+} // namespace mongo::rpc
diff --git a/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl
new file mode 100644
index 00000000000..cdb8fe44669
--- /dev/null
+++ b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl
@@ -0,0 +1,40 @@
+# Copyright (C) 2021-present MongoDB, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the Server Side Public License, version 1,
+# as published by MongoDB, Inc.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# Server Side Public License for more details.
+#
+# You should have received a copy of the Server Side Public License
+# along with this program. If not, see
+# <http://www.mongodb.com/licensing/server-side-public-license>.
+#
+# As a special exception, the copyright holders give permission to link the
+# code of portions of this program with the OpenSSL library under certain
+# conditions as described in each individual source file and distribute
+# linked combinations including the program with the OpenSSL library. You
+# must comply with the Server Side Public License in all respects for
+# all of the code used other than as permitted herein. If you modify file(s)
+# with this exception, you may extend this exception to your version of the
+# file(s), but you are not obligated to do so. If you do not wish to do so,
+# delete this exception statement from your version. If you delete this
+# exception statement from all source files in the program, then also delete
+# it in the license file.
+#
+
+global:
+ cpp_namespace: "mongo::rpc"
+
+server_parameters:
+ # Kill switch just in case this behavior is slow or breaks something.
+ rewriteStateChangeErrors:
+ description: >-
+ A failsafe to disable mongos state change error rewriting.
+ set_at: startup
+ cpp_vartype: bool
+ cpp_varname: rewriteStateChangeErrors
+ default: true
diff --git a/src/mongo/rpc/rewrite_state_change_errors_test.cpp b/src/mongo/rpc/rewrite_state_change_errors_test.cpp
new file mode 100644
index 00000000000..66cda0cd0c2
--- /dev/null
+++ b/src/mongo/rpc/rewrite_state_change_errors_test.cpp
@@ -0,0 +1,181 @@
+/**
+ * Copyright (C) 2021-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/rpc/rewrite_state_change_errors.h"
+
+#include "mongo/base/error_codes.h"
+#include "mongo/bson/bsonobj.h"
+#include "mongo/bson/bsonobjbuilder.h"
+#include "mongo/db/service_context.h"
+#include "mongo/rpc/message.h"
+#include "mongo/rpc/op_msg.h"
+#include "mongo/s/is_mongos.h"
+#include "mongo/unittest/unittest.h"
+
+namespace mongo::rpc {
+namespace {
+
+class RewriteStateChangeErrorsTest : public unittest::Test {
+public:
+ void setUp() override {
+ _savedIsMongos = isMongos();
+ setMongos(true); // whole feature only happens on mongos
+ RewriteStateChangeErrors::setEnabled(&*sc, true);
+ }
+
+ void tearDown() override {
+ setMongos(_savedIsMongos);
+ }
+
+ /** Run rewrite on `obj` and return what it was remapped to if anything. */
+ BSONObj rewriteObj(const BSONObj& obj) {
+ if (auto newDoc = RewriteStateChangeErrors::rewrite(obj, &*opCtx))
+ return *newDoc;
+ return obj;
+ }
+
+ /** Make an error node corresponding to `ec`. */
+ BSONObj errorObject(ErrorCodes::Error ec) {
+ BSONObjBuilder bob;
+ if (ec == ErrorCodes::OK)
+ bob.append("ok", 1.);
+ else
+ bob.append("ok", 0.)
+ .append("code", static_cast<int>(ec))
+ .append("codeName", ErrorCodes::errorString(ec));
+ return bob.obj();
+ }
+
+ /** Make an error node corresponding to `ec` with `errmsg`. */
+ BSONObj errorObject(ErrorCodes::Error ec, std::string errmsg) {
+ return BSONObjBuilder(errorObject(ec)).append("errmsg", errmsg).obj();
+ }
+
+ /** A few codes and what we expect them to be rewritten to. */
+ struct InOutCode {
+ ErrorCodes::Error in;
+ ErrorCodes::Error out;
+ };
+ static constexpr InOutCode errorCodeScenarios[] = {
+ {ErrorCodes::InterruptedAtShutdown, ErrorCodes::HostUnreachable},
+ {ErrorCodes::ShutdownInProgress, ErrorCodes::HostUnreachable},
+ {ErrorCodes::OK, ErrorCodes::OK},
+ {ErrorCodes::BadValue, ErrorCodes::BadValue},
+ };
+
+ ServiceContext::UniqueServiceContext sc = ServiceContext::make();
+ ServiceContext::UniqueClient cc = sc->makeClient("test", nullptr);
+ ServiceContext::UniqueOperationContext opCtx = sc->makeOperationContext(cc.get());
+
+private:
+ bool _savedIsMongos;
+};
+
+// Rewrite Shutdown errors received from proxied commands.
+TEST_F(RewriteStateChangeErrorsTest, Enabled) {
+ for (auto&& [in, out] : errorCodeScenarios) {
+ ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(in)), errorObject(out));
+ }
+}
+
+// Check that rewrite behavior can be disabled per-ServiceContext.
+TEST_F(RewriteStateChangeErrorsTest, Disabled) {
+ RewriteStateChangeErrors::setEnabled(&*sc, false);
+ ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)),
+ errorObject(ErrorCodes::InterruptedAtShutdown));
+}
+
+// Check that rewrite behavior can be disabled per-opCtx.
+TEST_F(RewriteStateChangeErrorsTest, DisabledOpCtx) {
+ RewriteStateChangeErrors::setEnabled(&*opCtx, false);
+ ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)),
+ errorObject(ErrorCodes::InterruptedAtShutdown));
+}
+
+// If locally shutting down, then shutdown errors must not be rewritten.
+TEST_F(RewriteStateChangeErrorsTest, LocalShutdown) {
+ sc->setKillAllOperations();
+ ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)),
+ errorObject(ErrorCodes::InterruptedAtShutdown));
+}
+
+TEST_F(RewriteStateChangeErrorsTest, RewriteErrmsg) {
+ const std::pair<std::string, std::string> scenarios[] = {
+ {"not master", "(NOT_PRIMARY)"},
+ {"node is recovering", "(NODE_IS_RECOVERING)"},
+ {"NOT master", "NOT master"},
+ {"", ""},
+ {" not masternot master ", " (NOT_PRIMARY)(NOT_PRIMARY) "},
+ {"not masternode is recovering", "(NOT_PRIMARY)(NODE_IS_RECOVERING)"},
+ };
+ for (auto&& io : scenarios) {
+ ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown, io.first)),
+ errorObject(ErrorCodes::HostUnreachable, io.second));
+ }
+}
+
+// Can find and rewrite the `writeConcernError` in an `ok:1` response.
+TEST_F(RewriteStateChangeErrorsTest, WriteConcernError) {
+ // Make an OK object, and append a `writeConcernError` subobject bearing the `ec` error.
+ auto wceObject = [&](ErrorCodes::Error ec) {
+ return BSONObjBuilder(errorObject(ErrorCodes::OK))
+ .append("writeConcernError", errorObject(ec))
+ .obj();
+ };
+ for (auto&& [in, out] : errorCodeScenarios) {
+ ASSERT_BSONOBJ_EQ(rewriteObj(wceObject(in)), wceObject(out));
+ }
+}
+
+// Can find and rewrite the `writeErrors` array elements in an `ok:1` response.
+TEST_F(RewriteStateChangeErrorsTest, WriteErrors) {
+ // Make an OK object, and append a `writeErrors` subobject bearing the `ec` errors.
+ auto weObject = [&](std::vector<ErrorCodes::Error> ecVec) {
+ BSONObjBuilder bob(errorObject(ErrorCodes::OK));
+ {
+ BSONArrayBuilder bab(bob.subarrayStart("writeErrors"));
+ for (ErrorCodes::Error ec : ecVec)
+ bab.append(errorObject(ec));
+ }
+ return bob.obj();
+ };
+ for (auto&& [in, out] : errorCodeScenarios) {
+ ASSERT_BSONOBJ_EQ(rewriteObj(weObject({in})), weObject({out}));
+ }
+ // Now try all the errorCodeScenarios as a single array of `writeErrors`.
+ std::vector<ErrorCodes::Error> allIn, allOut;
+ for (auto&& [in, out] : errorCodeScenarios) {
+ allIn.push_back(in);
+ allOut.push_back(out);
+ }
+ ASSERT_BSONOBJ_EQ(rewriteObj(weObject(allIn)), weObject(allOut));
+}
+
+} // namespace
+} // namespace mongo::rpc
diff --git a/src/mongo/s/commands/SConscript b/src/mongo/s/commands/SConscript
index d56a8ae52d3..4bc7bfc06b9 100644
--- a/src/mongo/s/commands/SConscript
+++ b/src/mongo/s/commands/SConscript
@@ -150,6 +150,7 @@ env.Library(
'$BUILD_DIR/mongo/executor/async_request_executor',
'$BUILD_DIR/mongo/idl/server_parameter',
'$BUILD_DIR/mongo/rpc/client_metadata',
+ '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors',
'$BUILD_DIR/mongo/s/cluster_last_error_info',
'$BUILD_DIR/mongo/s/mongos_topology_coordinator',
'$BUILD_DIR/mongo/s/query/cluster_aggregate',
diff --git a/src/mongo/s/commands/cluster_hello_cmd.cpp b/src/mongo/s/commands/cluster_hello_cmd.cpp
index cdb05cf2c9d..a2b404b1513 100644
--- a/src/mongo/s/commands/cluster_hello_cmd.cpp
+++ b/src/mongo/s/commands/cluster_hello_cmd.cpp
@@ -45,6 +45,7 @@
#include "mongo/db/wire_version.h"
#include "mongo/logv2/log.h"
#include "mongo/rpc/metadata/client_metadata.h"
+#include "mongo/rpc/rewrite_state_change_errors.h"
#include "mongo/rpc/topology_version_gen.h"
#include "mongo/s/mongos_topology_coordinator.h"
#include "mongo/transport/message_compressor_manager.h"
@@ -120,6 +121,9 @@ public:
waitInHello.pauseWhileSet(opCtx);
+ // "hello" is exempt from error code rewrites.
+ rpc::RewriteStateChangeErrors::setEnabled(opCtx, false);
+
auto client = opCtx->getClient();
if (ClientMetadata::tryFinalize(client)) {
audit::logClientMetadata(client);
diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp
index 72848f671e9..a50c52b77f4 100644
--- a/src/mongo/s/commands/strategy.cpp
+++ b/src/mongo/s/commands/strategy.cpp
@@ -74,6 +74,7 @@
#include "mongo/rpc/metadata/tracking_metadata.h"
#include "mongo/rpc/op_msg.h"
#include "mongo/rpc/op_msg_rpc_impls.h"
+#include "mongo/rpc/rewrite_state_change_errors.h"
#include "mongo/s/catalog_cache.h"
#include "mongo/s/client/shard_registry.h"
#include "mongo/s/cluster_commands_helpers.h"
@@ -1357,6 +1358,11 @@ DbResponse ClientCommand::_produceResponse() {
dbResponse.nextInvocation = reply->getNextInvocation();
}
}
+ if (auto doc = rpc::RewriteStateChangeErrors::rewrite(reply->getBodyBuilder().asTempObj(),
+ _rec->getOpCtx())) {
+ reply->reset();
+ reply->getBodyBuilder().appendElements(*doc);
+ }
dbResponse.response = reply->done();
return dbResponse;
diff --git a/src/mongo/shell/error_codes.tpl.js b/src/mongo/shell/error_codes.tpl.js
index f4299deae3f..21f9143e2f7 100644
--- a/src/mongo/shell/error_codes.tpl.js
+++ b/src/mongo/shell/error_codes.tpl.js
@@ -81,3 +81,38 @@ ErrorCodes.is${cat.name} = function(err) {
}
};
//#end for
+
+/**
+ * Returns true if the mongos `s` is expected to rewrite state change errors.
+ * This is determined by a `getParameter` of the "rewriteStateChangeErrors" option.
+ * Failure would indicate a mongos that doesn't support the rewrite feature.
+ */
+ErrorCodes.probeMongosRewrite = function(s) {
+ const param = "rewriteStateChangeErrors";
+ const res = s.adminCommand({getParameter: 1, [param]: 1});
+ if (!(param in res)) {
+ print("Mongos did not return a " + param + " server parameter: " + tojson(res));
+ return false;
+ }
+ return res[param];
+};
+
+/**
+ * Returns the ErrorCode to which the specified mongos `s` would remap the
+ * specified `err`. Mongos normally rewrites connection state change errors,
+ * unless it is shutting down or the code was injected by a mongos failpoint.
+ *
+ * The optional `doesRewrite` bool parameter provides a mechanism to bypass the
+ * probe, which may be useful if the probe would interfere with a test's
+ * operation.
+ */
+ErrorCodes.doMongosRewrite = function(s, err, doesRewrite) {
+ if (doesRewrite === undefined)
+ doesRewrite = ErrorCodes.probeMongosRewrite(s);
+ if (doesRewrite) {
+ if (ErrorCodes.isNotPrimaryError(err) || ErrorCodes.isShutdownError(err)) {
+ return ErrorCodes.HostUnreachable;
+ }
+ }
+ return err;
+};