From 4a311668ac834b6e363a44c9f00cad9d4790288f Mon Sep 17 00:00:00 2001 From: Billy Donahue Date: Mon, 10 May 2021 18:01:22 -0400 Subject: SERVER-50549 transform state-changing errors returned by mongos --- src/mongo/rpc/SConscript | 19 ++ src/mongo/rpc/rewrite_state_change_errors.cpp | 235 +++++++++++++++++++++ src/mongo/rpc/rewrite_state_change_errors.h | 95 +++++++++ ...ewrite_state_change_errors_server_parameter.idl | 40 ++++ src/mongo/rpc/rewrite_state_change_errors_test.cpp | 181 ++++++++++++++++ 5 files changed, 570 insertions(+) create mode 100644 src/mongo/rpc/rewrite_state_change_errors.cpp create mode 100644 src/mongo/rpc/rewrite_state_change_errors.h create mode 100644 src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl create mode 100644 src/mongo/rpc/rewrite_state_change_errors_test.cpp (limited to 'src/mongo/rpc') diff --git a/src/mongo/rpc/SConscript b/src/mongo/rpc/SConscript index c16d1e116c4..dd7d6205c84 100644 --- a/src/mongo/rpc/SConscript +++ b/src/mongo/rpc/SConscript @@ -74,6 +74,23 @@ env.Library( ], LIBDEPS_PRIVATE=[ '$BUILD_DIR/mongo/idl/server_parameter', + 'rewrite_state_change_errors', + ], +) + +env.Library( + target='rewrite_state_change_errors', + source=[ + 'rewrite_state_change_errors.cpp', + 'rewrite_state_change_errors_server_parameter.idl', + ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/base', + '$BUILD_DIR/mongo/bson/mutable/mutable_bson', + '$BUILD_DIR/mongo/db/service_context', + '$BUILD_DIR/mongo/rpc/protocol', + '$BUILD_DIR/mongo/s/is_mongos', + '$BUILD_DIR/third_party/shim_pcrecpp', ], ) @@ -170,12 +187,14 @@ if wiredtiger: 'op_msg_test.cpp', 'protocol_test.cpp', 'reply_builder_test.cpp', + 'rewrite_state_change_errors_test.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/client/clientdriver_minimal', '$BUILD_DIR/third_party/wiredtiger/wiredtiger_checksum', 'client_metadata', 'metadata', + 'rewrite_state_change_errors', 'rpc', ] ) diff --git a/src/mongo/rpc/rewrite_state_change_errors.cpp b/src/mongo/rpc/rewrite_state_change_errors.cpp new file mode 100644 index 00000000000..9f0fea0c294 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors.cpp @@ -0,0 +1,235 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kNetwork + +#include "mongo/rpc/rewrite_state_change_errors.h" + +#include "mongo/platform/basic.h" + +#include +#include + +#include +#include +#include + +#include "mongo/bson/mutable/document.h" +#include "mongo/bson/mutable/element.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" +#include "mongo/logv2/log.h" +#include "mongo/platform/atomic_word.h" +#include "mongo/rpc/message.h" +#include "mongo/rpc/op_msg.h" +#include "mongo/rpc/rewrite_state_change_errors_server_parameter_gen.h" +#include "mongo/s/is_mongos.h" +#include "mongo/util/assert_util.h" +#include "mongo/util/static_immortal.h" + +namespace mongo::rpc { +namespace { + +struct RewriteEnabled { + bool enabled = rewriteStateChangeErrors; +}; + +/** Enable for the entire service. */ +auto enabledForService = ServiceContext::declareDecoration(); + +/** Enable for a single operation. */ +auto enabledForOperation = OperationContext::declareDecoration(); + + +/** + * We must replace key phrases in `errmsg` with sensible strings that + * drivers will ignore. Return scrubbed `val`, or no value if it + * doesn't need scrubbing. + * + * See + * https://github.com/mongodb/specifications/blob/master/source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#not-master-and-node-is-recovering + */ +boost::optional scrubErrmsg(StringData val) { + struct Scrub { + pcrecpp::RE pat; + std::string sub; + }; + static const StaticImmortal scrubs = std::array{ + Scrub{pcrecpp::RE("not master"), "(NOT_PRIMARY)"}, + Scrub{pcrecpp::RE("node is recovering"), "(NODE_IS_RECOVERING)"}, + }; + // Fast scan for the common case that no key phrase is present. + static const StaticImmortal fastScan = [] { + std::string pat; + StringData sep; + auto out = std::back_inserter(pat); + for (const auto& scrub : *scrubs) { + out = format_to(out, FMT_STRING("{}({})"), sep, scrub.pat.pattern()); + sep = "|"_sd; + } + return pcrecpp::RE(pat); + }(); + + pcrecpp::StringPiece pcreVal(val.rawData(), val.size()); + + if (fastScan->PartialMatch(pcreVal)) { + std::string s{val}; + bool didSub = false; + for (auto&& scrub : *scrubs) { + bool subOk = scrub.pat.GlobalReplace(scrub.sub, &s); + didSub = (didSub || subOk); + } + if (didSub) + return s; + } + return {}; +} + +/** + * Change the {code, codeName, errmsg} fields to cloak a proxied state change + * error. We choose `HostUnreachable` as it is retryable but doesn't induce an + * SDAM state change. + */ +void editErrorNode(mutablebson::Element&& node) { + if (auto codeElement = node["code"]; codeElement.ok()) { + static constexpr auto newCode = ErrorCodes::HostUnreachable; + uassertStatusOK(codeElement.setValueInt(newCode)); + // If there's a corresponding `codeName`, replace it to match. + if (auto codeName = node["codeName"]; codeName.ok()) + uassertStatusOK(codeName.setValueString(ErrorCodes::errorString(newCode))); + } + if (auto errmsg = node["errmsg"]; errmsg.ok() && errmsg.isType(String)) + if (auto scrubbed = scrubErrmsg(errmsg.getValueString())) + uassertStatusOK(errmsg.setValueString(*scrubbed)); +} + +/** + * If `node` contains a numeric "code" field that indicates a state change, + * returns the value of that code field as an `ErrorCodes::Error`. Otherwise + * returns a disengaged optional. + * + * There are two categories of state change errors: ShutdownError and + * NotPrimaryError. + */ +boost::optional needsRewrite(ServiceContext* sc, const BSONObj& node) { + int32_t intCode{}; + if (!node["code"].coerce(&intCode)) + return {}; + ErrorCodes::Error ec{intCode}; + + if (ErrorCodes::isA(ec)) { + return ec; + } + + // ShutdownError codes are correct if this server is also in shutdown. If + // this server is shutting down, then even if the shutdown error didn't + // originate from this server, it might as well have. + if (ErrorCodes::isA(ec) && !sc->getKillAllOperations()) { + return ec; + } + + return {}; +} + +/** + * Returns a copy of doc with errors rewritten to cloak state change errors if + * necessary. Returns disengaged optional if no changes were necessary. + */ +boost::optional rewriteDocument(const BSONObj& doc, OperationContext* opCtx) { + boost::optional mutableDoc; + auto lazyMutableRoot = [&] { + if (!mutableDoc) + mutableDoc.emplace(doc); + return mutableDoc->root(); + }; + + ServiceContext* sc = opCtx->getServiceContext(); + + // Skip unless there's an "ok" element value equivalent to expected 0 or 1. + // "ok" is conventionally a NumberDouble, but coerce since this is unspecified. + double okValue = 0; + if (!doc.getField("ok").coerce(&okValue) || (okValue != 0 && okValue != 1)) + return {}; // Skip: missing or unusable "ok" field. + + boost::optional oldCode; + + // The root of the doc is an error-bearing node if not ok. + if (okValue == 0 && (oldCode = needsRewrite(sc, doc))) + editErrorNode(lazyMutableRoot()); + + // The `writeErrors` and `writeConcernError` nodes might need editing. + // `writeErrors` is an array of error-bearing nodes like the doc root. + if (const auto& we = doc["writeErrors"]; we.type() == Array) { + size_t idx = 0; + BSONObj bArr = we.Obj(); + for (auto ai = bArr.begin(); ai != bArr.end(); ++ai, ++idx) + if (ai->type() == Object && (oldCode = needsRewrite(sc, ai->Obj()))) + editErrorNode(lazyMutableRoot()["writeErrors"][idx]); + } + + // `writeConcernError` is a single error-bearing node. + if (const auto& wce = doc["writeConcernError"]; wce.type() == Object) { + if ((oldCode = needsRewrite(sc, wce.Obj()))) + editErrorNode(lazyMutableRoot()["writeConcernError"]); + } + + if (mutableDoc) { + LOGV2_DEBUG(1, 5054900, "Rewrote state change error", "code"_attr = oldCode); + return mutableDoc->getObject(); + } + return {}; +} + +} // namespace + + +bool RewriteStateChangeErrors::getEnabled(OperationContext* opCtx) { + return enabledForOperation(opCtx).enabled; +} + +void RewriteStateChangeErrors::setEnabled(OperationContext* opCtx, bool e) { + enabledForOperation(opCtx).enabled = e; +} + +bool RewriteStateChangeErrors::getEnabled(ServiceContext* sc) { + return enabledForService(sc).enabled; +} + +void RewriteStateChangeErrors::setEnabled(ServiceContext* sc, bool e) { + enabledForService(sc).enabled = e; +} + +boost::optional RewriteStateChangeErrors::rewrite(BSONObj doc, OperationContext* opCtx) { + auto sc = opCtx->getServiceContext(); + if (!isMongos() || (sc && !getEnabled(sc)) || !getEnabled(opCtx)) + return {}; + return rewriteDocument(doc, opCtx); +} + +} // namespace mongo::rpc diff --git a/src/mongo/rpc/rewrite_state_change_errors.h b/src/mongo/rpc/rewrite_state_change_errors.h new file mode 100644 index 00000000000..9edccd7e999 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors.h @@ -0,0 +1,95 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include + +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" +#include "mongo/rpc/message.h" + +namespace mongo::rpc { + +class RewriteStateChangeErrors { +public: + /** + * Enable/disable for an entire server. + * The default is determined by server parameter "rewriteStateChangeErrors". + */ + static bool getEnabled(ServiceContext* sc); + static void setEnabled(ServiceContext* sc, bool e); + + /** Enable/disable only for a single operation. */ + static bool getEnabled(OperationContext* opCtx); + static void setEnabled(OperationContext* opCtx, bool e); + + /** + * Transforms an outgoing message to conditionally mask "state change" errors, + * which are errors that cause a client to change its connection to the host + * that sent it, such as marking it "Unknown". A shutdown error that's emitted + * by a proxy server (e.g. mongos) can be misinterpreted by a naive client to + * be indicative of the proxy server shutting down. So this simple rewrite + * scheme attempts to mask those errors on the way out. + * + * Return a disengaged optional if no rewrite is needed. + * Otherwise, returns a copy of doc with the errors rewritten. + * + * Error-bearing subobjects are places in the document where a `code` element + * can indicate an error status. When rewriting, any error-bearing subobjects + * are examined for a `code` element. + * + * In an error document (with "ok" mapped to 0.0), only the root element is + * error-bearing. In a success document (where "ok" is mapped to 1.0), the + * `writeConcernError` subobject and all the subobjects in the `writeErrors` + * BSON array are error-bearing subobjects. + * + * Rewriting occurs only if all of these conditions are met: + * + * - This feature wasn't deselected with a `setEnabled` call, either on the + * `opCtx` or for more widely for its associated ServiceContext. + * This will happen with failpoint-induced errors or with "hello" commands, + * both of which are exempt from state change error rewriting. + * + * - doc contains no error nodes that need rewriting. + * - NotPrimaryError codes always need rewriting. + * - ShutdownError codes need rewriting unless this server is + * shutting down. + * + * Any rewritten `code` is replaced by `HostUnreachable`, and associated + * `codeName` replaced to be consistent with the new code. Additionally, the + * `errmsg` in the subobject has all occurrences of the key phrases "not + * master" and "node is recovering" replaced with "(NOT_PRIMARY)" and + * "(NODE_IS_RECOVERING)" respectively so that client state changes based on + * the presence of these legacy strings are suppressed. + */ + static boost::optional rewrite(BSONObj doc, OperationContext* opCtx); +}; + +} // namespace mongo::rpc diff --git a/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl new file mode 100644 index 00000000000..cdb8fe44669 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors_server_parameter.idl @@ -0,0 +1,40 @@ +# Copyright (C) 2021-present MongoDB, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the Server Side Public License, version 1, +# as published by MongoDB, Inc. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Server Side Public License for more details. +# +# You should have received a copy of the Server Side Public License +# along with this program. If not, see +# . +# +# As a special exception, the copyright holders give permission to link the +# code of portions of this program with the OpenSSL library under certain +# conditions as described in each individual source file and distribute +# linked combinations including the program with the OpenSSL library. You +# must comply with the Server Side Public License in all respects for +# all of the code used other than as permitted herein. If you modify file(s) +# with this exception, you may extend this exception to your version of the +# file(s), but you are not obligated to do so. If you do not wish to do so, +# delete this exception statement from your version. If you delete this +# exception statement from all source files in the program, then also delete +# it in the license file. +# + +global: + cpp_namespace: "mongo::rpc" + +server_parameters: + # Kill switch just in case this behavior is slow or breaks something. + rewriteStateChangeErrors: + description: >- + A failsafe to disable mongos state change error rewriting. + set_at: startup + cpp_vartype: bool + cpp_varname: rewriteStateChangeErrors + default: true diff --git a/src/mongo/rpc/rewrite_state_change_errors_test.cpp b/src/mongo/rpc/rewrite_state_change_errors_test.cpp new file mode 100644 index 00000000000..66cda0cd0c2 --- /dev/null +++ b/src/mongo/rpc/rewrite_state_change_errors_test.cpp @@ -0,0 +1,181 @@ +/** + * Copyright (C) 2021-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/rpc/rewrite_state_change_errors.h" + +#include "mongo/base/error_codes.h" +#include "mongo/bson/bsonobj.h" +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/db/service_context.h" +#include "mongo/rpc/message.h" +#include "mongo/rpc/op_msg.h" +#include "mongo/s/is_mongos.h" +#include "mongo/unittest/unittest.h" + +namespace mongo::rpc { +namespace { + +class RewriteStateChangeErrorsTest : public unittest::Test { +public: + void setUp() override { + _savedIsMongos = isMongos(); + setMongos(true); // whole feature only happens on mongos + RewriteStateChangeErrors::setEnabled(&*sc, true); + } + + void tearDown() override { + setMongos(_savedIsMongos); + } + + /** Run rewrite on `obj` and return what it was remapped to if anything. */ + BSONObj rewriteObj(const BSONObj& obj) { + if (auto newDoc = RewriteStateChangeErrors::rewrite(obj, &*opCtx)) + return *newDoc; + return obj; + } + + /** Make an error node corresponding to `ec`. */ + BSONObj errorObject(ErrorCodes::Error ec) { + BSONObjBuilder bob; + if (ec == ErrorCodes::OK) + bob.append("ok", 1.); + else + bob.append("ok", 0.) + .append("code", static_cast(ec)) + .append("codeName", ErrorCodes::errorString(ec)); + return bob.obj(); + } + + /** Make an error node corresponding to `ec` with `errmsg`. */ + BSONObj errorObject(ErrorCodes::Error ec, std::string errmsg) { + return BSONObjBuilder(errorObject(ec)).append("errmsg", errmsg).obj(); + } + + /** A few codes and what we expect them to be rewritten to. */ + struct InOutCode { + ErrorCodes::Error in; + ErrorCodes::Error out; + }; + static constexpr InOutCode errorCodeScenarios[] = { + {ErrorCodes::InterruptedAtShutdown, ErrorCodes::HostUnreachable}, + {ErrorCodes::ShutdownInProgress, ErrorCodes::HostUnreachable}, + {ErrorCodes::OK, ErrorCodes::OK}, + {ErrorCodes::BadValue, ErrorCodes::BadValue}, + }; + + ServiceContext::UniqueServiceContext sc = ServiceContext::make(); + ServiceContext::UniqueClient cc = sc->makeClient("test", nullptr); + ServiceContext::UniqueOperationContext opCtx = sc->makeOperationContext(cc.get()); + +private: + bool _savedIsMongos; +}; + +// Rewrite Shutdown errors received from proxied commands. +TEST_F(RewriteStateChangeErrorsTest, Enabled) { + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(in)), errorObject(out)); + } +} + +// Check that rewrite behavior can be disabled per-ServiceContext. +TEST_F(RewriteStateChangeErrorsTest, Disabled) { + RewriteStateChangeErrors::setEnabled(&*sc, false); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +// Check that rewrite behavior can be disabled per-opCtx. +TEST_F(RewriteStateChangeErrorsTest, DisabledOpCtx) { + RewriteStateChangeErrors::setEnabled(&*opCtx, false); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +// If locally shutting down, then shutdown errors must not be rewritten. +TEST_F(RewriteStateChangeErrorsTest, LocalShutdown) { + sc->setKillAllOperations(); + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown)), + errorObject(ErrorCodes::InterruptedAtShutdown)); +} + +TEST_F(RewriteStateChangeErrorsTest, RewriteErrmsg) { + const std::pair scenarios[] = { + {"not master", "(NOT_PRIMARY)"}, + {"node is recovering", "(NODE_IS_RECOVERING)"}, + {"NOT master", "NOT master"}, + {"", ""}, + {" not masternot master ", " (NOT_PRIMARY)(NOT_PRIMARY) "}, + {"not masternode is recovering", "(NOT_PRIMARY)(NODE_IS_RECOVERING)"}, + }; + for (auto&& io : scenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(errorObject(ErrorCodes::InterruptedAtShutdown, io.first)), + errorObject(ErrorCodes::HostUnreachable, io.second)); + } +} + +// Can find and rewrite the `writeConcernError` in an `ok:1` response. +TEST_F(RewriteStateChangeErrorsTest, WriteConcernError) { + // Make an OK object, and append a `writeConcernError` subobject bearing the `ec` error. + auto wceObject = [&](ErrorCodes::Error ec) { + return BSONObjBuilder(errorObject(ErrorCodes::OK)) + .append("writeConcernError", errorObject(ec)) + .obj(); + }; + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(wceObject(in)), wceObject(out)); + } +} + +// Can find and rewrite the `writeErrors` array elements in an `ok:1` response. +TEST_F(RewriteStateChangeErrorsTest, WriteErrors) { + // Make an OK object, and append a `writeErrors` subobject bearing the `ec` errors. + auto weObject = [&](std::vector ecVec) { + BSONObjBuilder bob(errorObject(ErrorCodes::OK)); + { + BSONArrayBuilder bab(bob.subarrayStart("writeErrors")); + for (ErrorCodes::Error ec : ecVec) + bab.append(errorObject(ec)); + } + return bob.obj(); + }; + for (auto&& [in, out] : errorCodeScenarios) { + ASSERT_BSONOBJ_EQ(rewriteObj(weObject({in})), weObject({out})); + } + // Now try all the errorCodeScenarios as a single array of `writeErrors`. + std::vector allIn, allOut; + for (auto&& [in, out] : errorCodeScenarios) { + allIn.push_back(in); + allOut.push_back(out); + } + ASSERT_BSONOBJ_EQ(rewriteObj(weObject(allIn)), weObject(allOut)); +} + +} // namespace +} // namespace mongo::rpc -- cgit v1.2.1