diff options
author | Billy Donahue <billy.donahue@mongodb.com> | 2021-05-10 18:01:22 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-06-11 16:19:59 +0000 |
commit | 3547ce01d32a4200d00205672fdf9d4eb44b7953 (patch) | |
tree | 1d59bc7480c4d63516554313e19c68d70dc6eaa9 | |
parent | cbe262bb3d99708c5e59bf3c9a32c7e15ce78fe6 (diff) | |
download | mongo-3547ce01d32a4200d00205672fdf9d4eb44b7953.tar.gz |
SERVER-50549 transform state-changing errors returned by mongos
(cherry picked from commit 4a311668ac834b6e363a44c9f00cad9d4790288f)
-rw-r--r-- | jstests/sharding/change_stream_error_label.js | 7 | ||||
-rw-r--r-- | jstests/sharding/linearizable_read_concern.js | 2 | ||||
-rw-r--r-- | jstests/sharding/mongos_not_retry_commands_in_transactions.js | 19 | ||||
-rw-r--r-- | jstests/sharding/retryable_write_error_labels.js | 9 | ||||
-rw-r--r-- | jstests/sharding/transactions_writes_not_retryable.js | 2 | ||||
-rw-r--r-- | src/mongo/db/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/commands.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/commands/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/commands/getmore_cmd.cpp | 8 | ||||
-rw-r--r-- | src/mongo/rpc/SConscript | 19 | ||||
-rw-r--r-- | src/mongo/rpc/rewrite_state_change_errors.cpp | 235 | ||||
-rw-r--r-- | src/mongo/rpc/rewrite_state_change_errors.h | 95 | ||||
-rw-r--r-- | src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl | 40 | ||||
-rw-r--r-- | src/mongo/rpc/rewrite_state_change_errors_test.cpp | 181 | ||||
-rw-r--r-- | src/mongo/s/commands/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/s/commands/strategy.cpp | 6 | ||||
-rw-r--r-- | src/mongo/shell/error_codes.tpl.js | 35 |
17 files changed, 654 insertions, 15 deletions
diff --git a/jstests/sharding/change_stream_error_label.js b/jstests/sharding/change_stream_error_label.js index 21c90f57b4d..2e1272343a4 100644 --- a/jstests/sharding/change_stream_error_label.js +++ b/jstests/sharding/change_stream_error_label.js @@ -37,7 +37,8 @@ const expectedStopShardErrors = [ // First, verify that the 'failGetMoreAfterCursorCheckout' failpoint can effectively exercise the // error label generation logic for change stream getMores. -function testFailGetMoreAfterCursorCheckoutFailpoint({errorCode, expectedLabel}) { +function testFailGetMoreAfterCursorCheckoutFailpoint({mongos, errorCode, expectedLabel}) { + errorCode = ErrorCodes.doMongosRewrite(mongos, errorCode); // Activate the failpoint and set the exception that it will throw. assert.commandWorked(testDB.adminCommand({ configureFailPoint: "failGetMoreAfterCursorCheckout", @@ -65,9 +66,9 @@ function testFailGetMoreAfterCursorCheckoutFailpoint({errorCode, expectedLabel}) } // Test the expected output for both resumable and non-resumable error codes. testFailGetMoreAfterCursorCheckoutFailpoint( - {errorCode: ErrorCodes.ShutdownInProgress, expectedLabel: true}); + {mongos: st.s, errorCode: ErrorCodes.ShutdownInProgress, expectedLabel: true}); testFailGetMoreAfterCursorCheckoutFailpoint( - {errorCode: ErrorCodes.FailedToParse, expectedLabel: false}); + {mongos: st.s, errorCode: ErrorCodes.FailedToParse, expectedLabel: false}); // Now test both aggregate and getMore under conditions of an actual cluster outage. Shard the // collection, split at {_id: 0}, and move the upper chunk to the other shard. diff --git a/jstests/sharding/linearizable_read_concern.js b/jstests/sharding/linearizable_read_concern.js index 0ac630639a4..a5720681aeb 100644 --- a/jstests/sharding/linearizable_read_concern.js +++ b/jstests/sharding/linearizable_read_concern.js @@ -85,7 +85,7 @@ var res = assert.commandFailed(testDB.runReadCommand({ readConcern: {level: "linearizable"}, maxTimeMS: shard0ReplTest.kDefaultTimeoutMS })); -assert.eq(res.code, ErrorCodes.NotWritablePrimary); +assert.eq(res.code, ErrorCodes.doMongosRewrite(st.s, ErrorCodes.NotWritablePrimary)); jsTestLog("Testing linearizable read from primaries."); diff --git a/jstests/sharding/mongos_not_retry_commands_in_transactions.js b/jstests/sharding/mongos_not_retry_commands_in_transactions.js index 44d6dcbf149..75965a92a7e 100644 --- a/jstests/sharding/mongos_not_retry_commands_in_transactions.js +++ b/jstests/sharding/mongos_not_retry_commands_in_transactions.js @@ -54,15 +54,16 @@ jsTest.log( "Testing that mongos doesn't retry the read command with startTransaction=true on replication set failover."); assert.commandWorked(setCommandToFail(primaryConnection, "find", kNs)); -assert.commandFailedWithCode(mongosDB.runCommand({ - find: kCollName, - filter: kDoc0, - startTransaction: true, - txnNumber: NumberLong(transactionNumber++), - stmtId: NumberInt(0), - autocommit: false -}), - ErrorCodes.InterruptedDueToReplStateChange); +assert.commandFailedWithCode( + mongosDB.runCommand({ + find: kCollName, + filter: kDoc0, + startTransaction: true, + txnNumber: NumberLong(transactionNumber++), + stmtId: NumberInt(0), + autocommit: false + }), + ErrorCodes.doMongosRewrite(st.s0, ErrorCodes.InterruptedDueToReplStateChange)); jsTest.log("Testing that mongos retries retryable writes on failover."); assert.commandWorked(setCommandToFail(primaryConnection, "insert", kNs)); diff --git a/jstests/sharding/retryable_write_error_labels.js b/jstests/sharding/retryable_write_error_labels.js index 368759f312d..e4ed4d93170 100644 --- a/jstests/sharding/retryable_write_error_labels.js +++ b/jstests/sharding/retryable_write_error_labels.js @@ -32,6 +32,15 @@ assert.commandWorked(primary.getDB(dbName).runCommand( {insert: collName, documents: [{_id: 0}], writeConcern: {w: "majority"}})); function checkErrorCode(res, expectedErrorCodes, isWCError) { + // Rewrite each element of the `expectedErrorCodes` array. + // If it's not an array, just rewrite the scalar. + var rewrite = ec => ErrorCodes.doMongosRewrite(st.s, ec); + if (Array.isArray(expectedErrorCodes)) { + expectedErrorCodes = expectedErrorCodes.map(rewrite); + } else { + expectedErrorCodes = rewrite(expectedErrorCodes); + } + if (isWCError) { assert.neq(null, res.writeConcernError, res); assert(anyEq([res.writeConcernError.code], expectedErrorCodes), res); diff --git a/jstests/sharding/transactions_writes_not_retryable.js b/jstests/sharding/transactions_writes_not_retryable.js index 7c33eab52cb..51fef80d5e2 100644 --- a/jstests/sharding/transactions_writes_not_retryable.js +++ b/jstests/sharding/transactions_writes_not_retryable.js @@ -31,7 +31,7 @@ function runTest(st, session, sessionDB, writeCmdName, writeCmd, isSharded) { session.startTransaction(); assert.commandFailedWithCode( sessionDB.runCommand(writeCmd), - retryableError, + ErrorCodes.doMongosRewrite(st.s, retryableError), "expected write in transaction not to be retried on retryable error, cmd: " + tojson(writeCmd) + ", sharded: " + isSharded); assert.commandFailedWithCode(session.abortTransaction_forTesting(), diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index e67d900d53a..dc259456ff8 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -559,6 +559,7 @@ env.Library( '$BUILD_DIR/mongo/bson/mutable/mutable_bson', '$BUILD_DIR/mongo/rpc/rpc', '$BUILD_DIR/mongo/rpc/command_status', + '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors', 'audit', 'command_generic_argument', 'commands/server_status_core', diff --git a/src/mongo/db/commands.cpp b/src/mongo/db/commands.cpp index a1077c0e45d..8cf54352bfa 100644 --- a/src/mongo/db/commands.cpp +++ b/src/mongo/db/commands.cpp @@ -60,6 +60,7 @@ #include "mongo/rpc/metadata/client_metadata.h" #include "mongo/rpc/op_msg_rpc_impls.h" #include "mongo/rpc/protocol.h" +#include "mongo/rpc/rewrite_state_change_errors.h" #include "mongo/rpc/write_concern_error_detail.h" #include "mongo/s/stale_exception.h" #include "mongo/util/fail_point.h" @@ -635,6 +636,13 @@ void CommandHelpers::evaluateFailCommandFailPoint(OperationContext* opCtx, const Command* cmd = invocation->definition(); failCommand.executeIf( [&](const BSONObj& data) { + // State change codes are rewritten on the way out of a `mongos` + // server. Errors injected via failpoint manipulation are normally + // exempt from this. However, we provide an override option so they + // can be made subject to rewriting if that's really necessary. + if (bool b; !bsonExtractBooleanField(data, "allowRewriteStateChange", &b).isOK() || !b) + rpc::RewriteStateChangeErrors::setEnabled(opCtx, false); + if (data.hasField(kErrorLabelsFieldName) && data[kErrorLabelsFieldName].type() == Array) { // Propagate error labels specified in the failCommand failpoint to the diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript index 7deb9940b97..536c5ddc949 100644 --- a/src/mongo/db/commands/SConscript +++ b/src/mongo/db/commands/SConscript @@ -308,6 +308,7 @@ env.Library( '$BUILD_DIR/mongo/db/query/command_request_response', '$BUILD_DIR/mongo/db/query_exec', '$BUILD_DIR/mongo/db/repl/replica_set_messages', + '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors', '$BUILD_DIR/mongo/db/rw_concern_d', '$BUILD_DIR/mongo/db/stats/counters', '$BUILD_DIR/mongo/db/stats/server_read_concern_write_concern_metrics', diff --git a/src/mongo/db/commands/getmore_cmd.cpp b/src/mongo/db/commands/getmore_cmd.cpp index 9b57b3c903a..b801ba25dbc 100644 --- a/src/mongo/db/commands/getmore_cmd.cpp +++ b/src/mongo/db/commands/getmore_cmd.cpp @@ -34,6 +34,7 @@ #include <memory> #include <string> +#include "mongo/bson/util/bson_extract.h" #include "mongo/db/auth/authorization_session.h" #include "mongo/db/catalog/collection.h" #include "mongo/db/client.h" @@ -58,6 +59,7 @@ #include "mongo/db/stats/counters.h" #include "mongo/db/stats/top.h" #include "mongo/logv2/log.h" +#include "mongo/rpc/rewrite_state_change_errors.h" #include "mongo/s/chunk_version.h" #include "mongo/util/fail_point.h" #include "mongo/util/scopeguard.h" @@ -557,7 +559,11 @@ public: // If the 'failGetMoreAfterCursorCheckout' failpoint is enabled, throw an exception with // the given 'errorCode' value, or ErrorCodes::InternalError if 'errorCode' is omitted. failGetMoreAfterCursorCheckout.executeIf( - [](const BSONObj& data) { + [&](const BSONObj& data) { + if (bool b; + !bsonExtractBooleanField(data, "allowRewriteStateChange", &b).isOK() || !b) + rpc::RewriteStateChangeErrors::setEnabled(opCtx, false); + auto errorCode = (data["errorCode"] ? data["errorCode"].safeNumberLong() : ErrorCodes::InternalError); uasserted(errorCode, "Hit the 'failGetMoreAfterCursorCheckout' failpoint"); diff --git a/src/mongo/rpc/SConscript b/src/mongo/rpc/SConscript index 82066ced6a2..0140da67a88 100644 --- a/src/mongo/rpc/SConscript +++ b/src/mongo/rpc/SConscript @@ -71,6 +71,23 @@ env.Library( ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/idl/server_parameter', + 'rewrite_state_change_errors', + ], +) + +env.Library( + target='rewrite_state_change_errors', + source=[ + 'rewrite_state_change_errors.cpp', + 'rewrite_state_change_errors_server_parameter.idl', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/bson/mutable/mutable_bson', + '$BUILD_DIR/mongo/db/service_context', + '$BUILD_DIR/mongo/rpc/protocol', + '$BUILD_DIR/mongo/s/is_mongos', + '$BUILD_DIR/third_party/shim_pcrecpp', ], ) @@ -167,12 +184,14 @@ env.CppUnitTest( 'op_msg_test.cpp', 'protocol_test.cpp', 'reply_builder_test.cpp', + 'rewrite_state_change_errors_test.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/client/clientdriver_minimal', '$BUILD_DIR/third_party/wiredtiger/wiredtiger_checksum', 'client_metadata', 'metadata', + 'rewrite_state_change_errors', 'rpc', ] ) diff --git a/src/mongo/rpc/rewrite_state_change_errors.cpp b/src/mongo/rpc/rewrite_state_change_errors.cpp new file mode 100644 index 00000000000..9f0fea0c294 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors.cpp @@ -0,0 +1,235 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kNetwork + +#include "mongo/rpc/rewrite_state_change_errors.h" + +#include "mongo/platform/basic.h" + +#include <array> +#include <string> + +#include <boost/optional.hpp> +#include <fmt/format.h> +#include <pcrecpp.h> + +#include "mongo/bson/mutable/document.h" +#include "mongo/bson/mutable/element.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" +#include "mongo/logv2/log.h" +#include "mongo/platform/atomic_word.h" +#include "mongo/rpc/message.h" +#include "mongo/rpc/op_msg.h" +#include "mongo/rpc/rewrite_state_change_errors_server_parameter_gen.h" +#include "mongo/s/is_mongos.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/static_immortal.h" + +namespace mongo::rpc { +namespace { + +struct RewriteEnabled { + bool enabled = rewriteStateChangeErrors; +}; + +/** Enable for the entire service. */ +auto enabledForService = ServiceContext::declareDecoration<RewriteEnabled>(); + +/** Enable for a single operation. */ +auto enabledForOperation = OperationContext::declareDecoration<RewriteEnabled>(); + + +/** + * We must replace key phrases in `errmsg` with sensible strings that + * drivers will ignore. Return scrubbed `val`, or no value if it + * doesn't need scrubbing. + * + * See + * https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#not-master-and-node-is-recovering + */ +boost::optional<std::string> scrubErrmsg(StringData val) { + struct Scrub { + pcrecpp::RE pat; + std::string sub; + }; + static const StaticImmortal scrubs = std::array{ + Scrub{pcrecpp::RE("not master"), "(NOT_PRIMARY)"}, + Scrub{pcrecpp::RE("node is recovering"), "(NODE_IS_RECOVERING)"}, + }; + // Fast scan for the common case that no key phrase is present. + static const StaticImmortal fastScan = [] { + std::string pat; + StringData sep; + auto out = std::back_inserter(pat); + for (const auto& scrub : *scrubs) { + out = format_to(out, FMT_STRING("{}({})"), sep, scrub.pat.pattern()); + sep = "|"_sd; + } + return pcrecpp::RE(pat); + }(); + + pcrecpp::StringPiece pcreVal(val.rawData(), val.size()); + + if (fastScan->PartialMatch(pcreVal)) { + std::string s{val}; + bool didSub = false; + for (auto&& scrub : *scrubs) { + bool subOk = scrub.pat.GlobalReplace(scrub.sub, &s); + didSub = (didSub || subOk); + } + if (didSub) + return s; + } + return {}; +} + +/** + * Change the {code, codeName, errmsg} fields to cloak a proxied state change + * error. We choose `HostUnreachable` as it is retryable but doesn't induce an + * SDAM state change. + */ +void editErrorNode(mutablebson::Element&& node) { + if (auto codeElement = node["code"]; codeElement.ok()) { + static constexpr auto newCode = ErrorCodes::HostUnreachable; + uassertStatusOK(codeElement.setValueInt(newCode)); + // If there's a corresponding `codeName`, replace it to match. + if (auto codeName = node["codeName"]; codeName.ok()) + uassertStatusOK(codeName.setValueString(ErrorCodes::errorString(newCode))); + } + if (auto errmsg = node["errmsg"]; errmsg.ok() && errmsg.isType(String)) + if (auto scrubbed = scrubErrmsg(errmsg.getValueString())) + uassertStatusOK(errmsg.setValueString(*scrubbed)); +} + +/** + * If `node` contains a numeric "code" field that indicates a state change, + * returns the value of that code field as an `ErrorCodes::Error`. Otherwise + * returns a disengaged optional. + * + * There are two categories of state change errors: ShutdownError and + * NotPrimaryError. + */ +boost::optional<ErrorCodes::Error> needsRewrite(ServiceContext* sc, const BSONObj& node) { + int32_t intCode{}; + if (!node["code"].coerce(&intCode)) + return {}; + ErrorCodes::Error ec{intCode}; + + if (ErrorCodes::isA<ErrorCategory::NotPrimaryError>(ec)) { + return ec; + } + + // ShutdownError codes are correct if this server is also in shutdown. If + // this server is shutting down, then even if the shutdown error didn't + // originate from this server, it might as well have. + if (ErrorCodes::isA<ErrorCategory::ShutdownError>(ec) && !sc->getKillAllOperations()) { + return ec; + } + + return {}; +} + +/** + * Returns a copy of doc with errors rewritten to cloak state change errors if + * necessary. Returns disengaged optional if no changes were necessary. + */ +boost::optional<BSONObj> rewriteDocument(const BSONObj& doc, OperationContext* opCtx) { + boost::optional<mutablebson::Document> mutableDoc; + auto lazyMutableRoot = [&] { + if (!mutableDoc) + mutableDoc.emplace(doc); + return mutableDoc->root(); + }; + + ServiceContext* sc = opCtx->getServiceContext(); + + // Skip unless there's an "ok" element value equivalent to expected 0 or 1. + // "ok" is conventionally a NumberDouble, but coerce since this is unspecified. + double okValue = 0; + if (!doc.getField("ok").coerce(&okValue) || (okValue != 0 && okValue != 1)) + return {}; // Skip: missing or unusable "ok" field. + + boost::optional<ErrorCodes::Error> oldCode; + + // The root of the doc is an error-bearing node if not ok. + if (okValue == 0 && (oldCode = needsRewrite(sc, doc))) + editErrorNode(lazyMutableRoot()); + + // The `writeErrors` and `writeConcernError` nodes might need editing. + // `writeErrors` is an array of error-bearing nodes like the doc root. + if (const auto& we = doc["writeErrors"]; we.type() == Array) { + size_t idx = 0; + BSONObj bArr = we.Obj(); + for (auto ai = bArr.begin(); ai != bArr.end(); ++ai, ++idx) + if (ai->type() == Object && (oldCode = needsRewrite(sc, ai->Obj()))) + editErrorNode(lazyMutableRoot()["writeErrors"][idx]); + } + + // `writeConcernError` is a single error-bearing node. + if (const auto& wce = doc["writeConcernError"]; wce.type() == Object) { + if ((oldCode = needsRewrite(sc, wce.Obj()))) + editErrorNode(lazyMutableRoot()["writeConcernError"]); + } + + if (mutableDoc) { + LOGV2_DEBUG(1, 5054900, "Rewrote state change error", "code"_attr = oldCode); + return mutableDoc->getObject(); + } + return {}; +} + +} // namespace + + +bool RewriteStateChangeErrors::getEnabled(OperationContext* opCtx) { + return enabledForOperation(opCtx).enabled; +} + +void RewriteStateChangeErrors::setEnabled(OperationContext* opCtx, bool e) { + enabledForOperation(opCtx).enabled = e; +} + +bool RewriteStateChangeErrors::getEnabled(ServiceContext* sc) { + return enabledForService(sc).enabled; +} + +void RewriteStateChangeErrors::setEnabled(ServiceContext* sc, bool e) { + enabledForService(sc).enabled = e; +} + +boost::optional<BSONObj> RewriteStateChangeErrors::rewrite(BSONObj doc, OperationContext* opCtx) { + auto sc = opCtx->getServiceContext(); + if (!isMongos() || (sc && !getEnabled(sc)) || !getEnabled(opCtx)) + return {}; + return rewriteDocument(doc, opCtx); +} + +} // namespace mongo::rpc diff --git a/src/mongo/rpc/rewrite_state_change_errors.h b/src/mongo/rpc/rewrite_state_change_errors.h new file mode 100644 index 00000000000..9edccd7e999 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors.h @@ -0,0 +1,95 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <boost/optional.hpp> + +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" +#include "mongo/rpc/message.h" + +namespace mongo::rpc { + +class RewriteStateChangeErrors { +public: + /** + * Enable/disable for an entire server. + * The default is determined by server parameter "rewriteStateChangeErrors". + */ + static bool getEnabled(ServiceContext* sc); + static void setEnabled(ServiceContext* sc, bool e); + + /** Enable/disable only for a single operation. */ + static bool getEnabled(OperationContext* opCtx); + static void setEnabled(OperationContext* opCtx, bool e); + + /** + * Transforms an outgoing message to conditionally mask "state change" errors, + * which are errors that cause a client to change its connection to the host + * that sent it, such as marking it "Unknown". A shutdown error that's emitted + * by a proxy server (e.g. mongos) can be misinterpreted by a naive client to + * be indicative of the proxy server shutting down. So this simple rewrite + * scheme attempts to mask those errors on the way out. + * + * Return a disengaged optional if no rewrite is needed. + * Otherwise, returns a copy of doc with the errors rewritten. + * + * Error-bearing subobjects are places in the document where a `code` element + * can indicate an error status. When rewriting, any error-bearing subobjects + * are examined for a `code` element. + * + * In an error document (with "ok" mapped to 0.0), only the root element is + * error-bearing. In a success document (where "ok" is mapped to 1.0), the + * `writeConcernError` subobject and all the subobjects in the `writeErrors` + * BSON array are error-bearing subobjects. + * + * Rewriting occurs only if all of these conditions are met: + * + * - This feature wasn't deselected with a `setEnabled` call, either on the + * `opCtx` or for more widely for its associated ServiceContext. + * This will happen with failpoint-induced errors or with "hello" commands, + * both of which are exempt from state change error rewriting. + * + * - doc contains no error nodes that need rewriting. + * - NotPrimaryError codes always need rewriting. + * - ShutdownError codes need rewriting unless this server is + * shutting down. + * + * Any rewritten `code` is replaced by `HostUnreachable`, and associated + * `codeName` replaced to be consistent with the new code. Additionally, the + * `errmsg` in the subobject has all occurrences of the key phrases "not + * master" and "node is recovering" replaced with "(NOT_PRIMARY)" and + * "(NODE_IS_RECOVERING)" respectively so that client state changes based on + * the presence of these legacy strings are suppressed. + */ + static boost::optional<BSONObj> rewrite(BSONObj doc, OperationContext* opCtx); +}; + +} // namespace mongo::rpc diff --git a/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl new file mode 100644 index 00000000000..cdb8fe44669 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl @@ -0,0 +1,40 @@ +# Copyright (C) 2021-present MongoDB, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the Server Side Public License, version 1, +# as published by MongoDB, Inc. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Server Side Public License for more details. +# +# You should have received a copy of the Server Side Public License +# along with this program. If not, see +# <http://www.mongodb.com/licensing/server-side-public-license>. +# +# As a special exception, the copyright holders give permission to link the +# code of portions of this program with the OpenSSL library under certain +# conditions as described in each individual source file and distribute +# linked combinations including the program with the OpenSSL library. You +# must comply with the Server Side Public License in all respects for +# all of the code used other than as permitted herein. If you modify file(s) +# with this exception, you may extend this exception to your version of the +# file(s), but you are not obligated to do so. If you do not wish to do so, +# delete this exception statement from your version. If you delete this +# exception statement from all source files in the program, then also delete +# it in the license file. +# + +global: + cpp_namespace: "mongo::rpc" + +server_parameters: + # Kill switch just in case this behavior is slow or breaks something. + rewriteStateChangeErrors: + description: >- + A failsafe to disable mongos state change error rewriting. + set_at: startup + cpp_vartype: bool + cpp_varname: rewriteStateChangeErrors + default: true diff --git a/src/mongo/rpc/rewrite_state_change_errors_test.cpp b/src/mongo/rpc/rewrite_state_change_errors_test.cpp new file mode 100644 index 00000000000..66cda0cd0c2 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors_test.cpp @@ -0,0 +1,181 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/rpc/rewrite_state_change_errors.h" + +#include "mongo/base/error_codes.h" +#include "mongo/bson/bsonobj.h" +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/db/service_context.h" +#include "mongo/rpc/message.h" +#include "mongo/rpc/op_msg.h" +#include "mongo/s/is_mongos.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::rpc { +namespace { + +class RewriteStateChangeErrorsTest : public unittest::Test { +public: + void setUp() override { + _savedIsMongos = isMongos(); + setMongos(true); // whole feature only happens on mongos + RewriteStateChangeErrors::setEnabled(&*sc, true); + } + + void tearDown() override { + setMongos(_savedIsMongos); + } + + /** Run rewrite on `obj` and return what it was remapped to if anything. */ + BSONObj rewriteObj(const BSONObj& obj) { + if (auto newDoc = RewriteStateChangeErrors::rewrite(obj, &*opCtx)) + return *newDoc; + return obj; + } + + /** Make an error node corresponding to `ec`. */ + BSONObj errorObject(ErrorCodes::Error ec) { + BSONObjBuilder bob; + if (ec == ErrorCodes::OK) + bob.append("ok", 1.); + else + bob.append("ok", 0.) + .append("code", static_cast<int>(ec)) + .append("codeName", ErrorCodes::errorString(ec)); + return bob.obj(); + } + + /** Make an error node corresponding to `ec` with `errmsg`. */ + BSONObj errorObject(ErrorCodes::Error ec, std::string errmsg) { + return BSONObjBuilder(errorObject(ec)).append("errmsg", errmsg).obj(); + } + + /** A few codes and what we expect them to be rewritten to. */ + struct InOutCode { + ErrorCodes::Error in; + ErrorCodes::Error out; + }; + static constexpr InOutCode errorCodeScenarios[] = { + {ErrorCodes::InterruptedAtShutdown, ErrorCodes::HostUnreachable}, + {ErrorCodes::ShutdownInProgress, ErrorCodes::HostUnreachable}, + {ErrorCodes::OK, ErrorCodes::OK}, + {ErrorCodes::BadValue, ErrorCodes::BadValue}, + }; + + ServiceContext::UniqueServiceContext sc = ServiceContext::make(); + ServiceContext::UniqueClient cc = sc->makeClient("test", nullptr); + ServiceContext::UniqueOperationContext opCtx = sc->makeOperationContext(cc.get()); + +private: + bool _savedIsMongos; +}; + +// Rewrite Shutdown errors received from proxied commands. +TEST_F(RewriteStateChangeErrorsTest, Enabled) { + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(in)), errorObject(out)); + } +} + +// Check that rewrite behavior can be disabled per-ServiceContext. +TEST_F(RewriteStateChangeErrorsTest, Disabled) { + RewriteStateChangeErrors::setEnabled(&*sc, false); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +// Check that rewrite behavior can be disabled per-opCtx. +TEST_F(RewriteStateChangeErrorsTest, DisabledOpCtx) { + RewriteStateChangeErrors::setEnabled(&*opCtx, false); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +// If locally shutting down, then shutdown errors must not be rewritten. +TEST_F(RewriteStateChangeErrorsTest, LocalShutdown) { + sc->setKillAllOperations(); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +TEST_F(RewriteStateChangeErrorsTest, RewriteErrmsg) { + const std::pair<std::string, std::string> scenarios[] = { + {"not master", "(NOT_PRIMARY)"}, + {"node is recovering", "(NODE_IS_RECOVERING)"}, + {"NOT master", "NOT master"}, + {"", ""}, + {" not masternot master ", " (NOT_PRIMARY)(NOT_PRIMARY) "}, + {"not masternode is recovering", "(NOT_PRIMARY)(NODE_IS_RECOVERING)"}, + }; + for (auto&& io : scenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown, io.first)), + errorObject(ErrorCodes::HostUnreachable, io.second)); + } +} + +// Can find and rewrite the `writeConcernError` in an `ok:1` response. +TEST_F(RewriteStateChangeErrorsTest, WriteConcernError) { + // Make an OK object, and append a `writeConcernError` subobject bearing the `ec` error. + auto wceObject = [&](ErrorCodes::Error ec) { + return BSONObjBuilder(errorObject(ErrorCodes::OK)) + .append("writeConcernError", errorObject(ec)) + .obj(); + }; + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(wceObject(in)), wceObject(out)); + } +} + +// Can find and rewrite the `writeErrors` array elements in an `ok:1` response. +TEST_F(RewriteStateChangeErrorsTest, WriteErrors) { + // Make an OK object, and append a `writeErrors` subobject bearing the `ec` errors. + auto weObject = [&](std::vector<ErrorCodes::Error> ecVec) { + BSONObjBuilder bob(errorObject(ErrorCodes::OK)); + { + BSONArrayBuilder bab(bob.subarrayStart("writeErrors")); + for (ErrorCodes::Error ec : ecVec) + bab.append(errorObject(ec)); + } + return bob.obj(); + }; + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(weObject({in})), weObject({out})); + } + // Now try all the errorCodeScenarios as a single array of `writeErrors`. + std::vector<ErrorCodes::Error> allIn, allOut; + for (auto&& [in, out] : errorCodeScenarios) { + allIn.push_back(in); + allOut.push_back(out); + } + ASSERT_BSONOBJ_EQ(rewriteObj(weObject(allIn)), weObject(allOut)); +} + +} // namespace +} // namespace mongo::rpc diff --git a/src/mongo/s/commands/SConscript b/src/mongo/s/commands/SConscript index 6e8f8b105f2..31d6cade799 100644 --- a/src/mongo/s/commands/SConscript +++ b/src/mongo/s/commands/SConscript @@ -131,6 +131,7 @@ env.Library( '$BUILD_DIR/mongo/executor/async_multicaster', '$BUILD_DIR/mongo/idl/server_parameter', '$BUILD_DIR/mongo/rpc/client_metadata', + '$BUILD_DIR/mongo/rpc/rewrite_state_change_errors', '$BUILD_DIR/mongo/s/query/cluster_aggregate', '$BUILD_DIR/mongo/s/query/cluster_client_cursor', '$BUILD_DIR/mongo/s/sharding_api', diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index 53c612d8eff..e0e8bad3acc 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -71,6 +71,7 @@ #include "mongo/rpc/metadata/tracking_metadata.h" #include "mongo/rpc/op_msg.h" #include "mongo/rpc/op_msg_rpc_impls.h" +#include "mongo/rpc/rewrite_state_change_errors.h" #include "mongo/s/catalog_cache.h" #include "mongo/s/client/parallel.h" #include "mongo/s/client/shard_connection.h" @@ -1113,6 +1114,11 @@ DbResponse Strategy::clientCommand(OperationContext* opCtx, const Message& m) { dbResponse.nextInvocation = reply->getNextInvocation(); } } + if (auto doc = + rpc::RewriteStateChangeErrors::rewrite(reply->getBodyBuilder().asTempObj(), opCtx)) { + reply->reset(); + reply->getBodyBuilder().appendElements(*doc); + } dbResponse.response = reply->done(); return dbResponse; diff --git a/src/mongo/shell/error_codes.tpl.js b/src/mongo/shell/error_codes.tpl.js index f4299deae3f..21f9143e2f7 100644 --- a/src/mongo/shell/error_codes.tpl.js +++ b/src/mongo/shell/error_codes.tpl.js @@ -81,3 +81,38 @@ ErrorCodes.is${cat.name} = function(err) { } }; //#end for + +/** + * Returns true if the mongos `s` is expected to rewrite state change errors. + * This is determined by a `getParameter` of the "rewriteStateChangeErrors" option. + * Failure would indicate a mongos that doesn't support the rewrite feature. + */ +ErrorCodes.probeMongosRewrite = function(s) { + const param = "rewriteStateChangeErrors"; + const res = s.adminCommand({getParameter: 1, [param]: 1}); + if (!(param in res)) { + print("Mongos did not return a " + param + " server parameter: " + tojson(res)); + return false; + } + return res[param]; +}; + +/** + * Returns the ErrorCode to which the specified mongos `s` would remap the + * specified `err`. Mongos normally rewrites connection state change errors, + * unless it is shutting down or the code was injected by a mongos failpoint. + * + * The optional `doesRewrite` bool parameter provides a mechanism to bypass the + * probe, which may be useful if the probe would interfere with a test's + * operation. + */ +ErrorCodes.doMongosRewrite = function(s, err, doesRewrite) { + if (doesRewrite === undefined) + doesRewrite = ErrorCodes.probeMongosRewrite(s); + if (doesRewrite) { + if (ErrorCodes.isNotPrimaryError(err) || ErrorCodes.isShutdownError(err)) { + return ErrorCodes.HostUnreachable; + } + } + return err; +}; |