diff options
author | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-11-12 16:20:11 -0500 |
---|---|---|
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-11-16 18:04:24 -0500 |
commit | 10872ad8758b99a266c0340b88cd81f20b89bdf7 (patch) | |
tree | b2aef171852486385a39326fd3cdfc63dbfccb9a | |
parent | bd58ea2ba5d17b960981990bb97cab133d7e90ed (diff) | |
download | mongo-10872ad8758b99a266c0340b88cd81f20b89bdf7.tar.gz |
SERVER-21392 Make commands retry on interrupted operations
9 files changed, 344 insertions, 95 deletions
diff --git a/src/mongo/s/catalog/replset/SConscript b/src/mongo/s/catalog/replset/SConscript index f4ec2adca85..ecf08b2927a 100644 --- a/src/mongo/s/catalog/replset/SConscript +++ b/src/mongo/s/catalog/replset/SConscript @@ -12,24 +12,11 @@ env.Library( '$BUILD_DIR/mongo/s/catalog/catalog_types', '$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_interface', '$BUILD_DIR/mongo/s/catalog/dist_lock_manager', + '$BUILD_DIR/mongo/s/client/sharding_client', '$BUILD_DIR/mongo/util/fail_point' ], ) -env.CppUnitTest( - target='replset_dist_lock_manager_test', - source=[ - 'replset_dist_lock_manager_test.cpp', - ], - LIBDEPS=[ - '$BUILD_DIR/mongo/db/service_context', - '$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_mock', - '$BUILD_DIR/mongo/util/foundation', - '$BUILD_DIR/mongo/util/tick_source_mock', - 'replset_dist_lock_manager' - ] -) - env.Library( target='dist_lock_catalog_impl', source=[ @@ -52,9 +39,10 @@ env.Library( ) env.CppUnitTest( - target='dist_lock_catalog_impl_test', + target='replset_dist_lock_manager_test', source=[ 'dist_lock_catalog_impl_test.cpp', + 'replset_dist_lock_manager_test.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/client/remote_command_targeter_mock', @@ -62,8 +50,10 @@ env.CppUnitTest( '$BUILD_DIR/mongo/executor/network_test_env', '$BUILD_DIR/mongo/executor/thread_pool_task_executor_test_fixture', '$BUILD_DIR/mongo/s/catalog/catalog_manager_mock', + '$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_mock', '$BUILD_DIR/mongo/s/coreshard', '$BUILD_DIR/mongo/s/mongoscore', + '$BUILD_DIR/mongo/util/tick_source_mock', 'dist_lock_catalog_impl', ] ) @@ -97,6 +87,7 @@ env.CppUnitTest( 'catalog_manager_replica_set_test.cpp', 'catalog_manager_replica_set_test_fixture.cpp', 'catalog_manager_replica_set_upgrade_test.cpp', + 'catalog_manager_replica_set_write_retry_test.cpp', ], LIBDEPS=[ '$BUILD_DIR/mongo/s/sharding_test_fixture', diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp index 6e53cd31590..674064e21f1 100644 --- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp +++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp @@ -821,8 +821,8 @@ bool CatalogManagerReplicaSet::runUserManagementWriteCommand(OperationContext* t cmdToRun = modifiedCmd.obj(); } - auto response = - grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, dbname, cmdToRun); + auto response = grid.shardRegistry()->runCommandOnConfigWithRetries( + txn, dbname, cmdToRun, ShardRegistry::kNotMasterErrors); if (!response.isOK()) { return Command::appendCommandStatus(*result, response.getStatus()); @@ -865,15 +865,9 @@ bool CatalogManagerReplicaSet::runUserManagementReadCommand(OperationContext* tx Status CatalogManagerReplicaSet::applyChunkOpsDeprecated(OperationContext* txn, const BSONArray& updateOps, const BSONArray& preCondition) { - ShardRegistry::ErrorCodesSet networkOrNotMasterErrors{ErrorCodes::NotMaster, - ErrorCodes::NotMasterNoSlaveOk, - ErrorCodes::HostUnreachable, - ErrorCodes::HostNotFound, - ErrorCodes::NetworkTimeout}; - BSONObj cmd = BSON("applyOps" << updateOps << "preCondition" << preCondition); auto response = grid.shardRegistry()->runCommandOnConfigWithRetries( - txn, "config", cmd, networkOrNotMasterErrors); + txn, "config", cmd, ShardRegistry::kNetworkOrNotMasterErrors); if (!response.isOK()) { return response.getStatus(); @@ -897,12 +891,21 @@ DistLockManager* CatalogManagerReplicaSet::getDistLockManager() { void CatalogManagerReplicaSet::writeConfigServerDirect(OperationContext* txn, const BatchedCommandRequest& batchRequest, BatchedCommandResponse* batchResponse) { - std::string dbname = batchRequest.getNS().db().toString(); + _runBatchWriteCommand(txn, batchRequest, batchResponse, ShardRegistry::kNotMasterErrors); +} + +void CatalogManagerReplicaSet::_runBatchWriteCommand( + OperationContext* txn, + const BatchedCommandRequest& batchRequest, + BatchedCommandResponse* batchResponse, + const ShardRegistry::ErrorCodesSet& errorsToCheck) { + const std::string dbname = batchRequest.getNS().db().toString(); invariant(dbname == "config" || dbname == "admin"); + const BSONObj cmdObj = batchRequest.toBSON(); auto response = - grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, dbname, cmdObj); + grid.shardRegistry()->runCommandOnConfigWithRetries(txn, dbname, cmdObj, errorsToCheck); if (!response.isOK()) { _toBatchError(response.getStatus(), batchResponse); return; @@ -934,7 +937,7 @@ Status CatalogManagerReplicaSet::insertConfigDocument(OperationContext* txn, for (int retry = 1; retry <= kMaxWriteRetry; retry++) { BatchedCommandResponse response; - writeConfigServerDirect(txn, request, &response); + _runBatchWriteCommand(txn, request, &response, ShardRegistry::kNotMasterErrors); Status status = response.toStatus(); @@ -958,16 +961,25 @@ Status CatalogManagerReplicaSet::insertConfigDocument(OperationContext* txn, } auto existingDocs = fetchDuplicate.getValue().value; + if (existingDocs.empty()) { + return Status(ErrorCodes::DuplicateKey, + stream() << "DuplicateKey error" << causedBy(status) + << " was returned after a retry attempt, but no documents " + "were found. This means a concurrent change occurred " + "together with the retries."); + } + invariant(existingDocs.size() == 1); BSONObj existing = std::move(existingDocs.front()); if (existing.woCompare(doc) == 0) { // Documents match, so treat the operation as success - status = Status::OK(); + return Status::OK(); } } - if (ErrorCodes::isNetworkError(status.code()) && (retry < kMaxWriteRetry)) { + if (ShardRegistry::kNetworkOrNotMasterErrors.count(status.code()) && + (retry < kMaxWriteRetry)) { continue; } @@ -1031,20 +1043,10 @@ Status CatalogManagerReplicaSet::removeConfigDocuments(OperationContext* txn, request.setNS(nss); request.setWriteConcern(WriteConcernOptions::Majority); - for (int retry = 1; retry <= kMaxWriteRetry; retry++) { - BatchedCommandResponse response; - writeConfigServerDirect(txn, request, &response); - - Status status = response.toStatus(); - - if (ErrorCodes::isNetworkError(status.code()) && (retry < kMaxWriteRetry)) { - continue; - } - - return status; - } + BatchedCommandResponse response; + _runBatchWriteCommand(txn, request, &response, ShardRegistry::kNetworkOrNotMasterErrors); - MONGO_UNREACHABLE; + return response.toStatus(); } Status CatalogManagerReplicaSet::_checkDbDoesNotExist(OperationContext* txn, @@ -1132,8 +1134,8 @@ Status CatalogManagerReplicaSet::_createCappedConfigCollection(OperationContext* StringData collName, int cappedSize) { BSONObj createCmd = BSON("create" << collName << "capped" << true << "size" << cappedSize); - auto result = - grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, "config", createCmd); + auto result = grid.shardRegistry()->runCommandOnConfigWithRetries( + txn, "config", createCmd, ShardRegistry::kNetworkOrNotMasterErrors); if (!result.isOK()) { return result.getStatus(); } @@ -1305,14 +1307,15 @@ StatusWith<BSONObj> CatalogManagerReplicaSet::_runReadCommand( OperationContext* txn, const std::string& dbname, const BSONObj& cmdObj, - const ReadPreferenceSetting& settings) { + const ReadPreferenceSetting& readPref) { for (int retry = 1; retry <= kMaxReadRetry; ++retry) { - auto response = grid.shardRegistry()->runCommandOnConfig(txn, settings, dbname, cmdObj); + auto response = grid.shardRegistry()->runCommandOnConfig(txn, readPref, dbname, cmdObj); if (response.isOK()) { return response; } - if (ErrorCodes::isNetworkError(response.getStatus().code()) && retry < kMaxReadRetry) { + if (ShardRegistry::kNetworkOrNotMasterErrors.count(response.getStatus().code()) && + retry < kMaxReadRetry) { continue; } diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.h b/src/mongo/s/catalog/replset/catalog_manager_replica_set.h index c2145dc90f1..93a8dc06ca7 100644 --- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.h +++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.h @@ -31,6 +31,7 @@ #include "mongo/client/connection_string.h" #include "mongo/db/repl/optime.h" #include "mongo/s/catalog/catalog_manager_common.h" +#include "mongo/s/client/shard_registry.h" #include "mongo/stdx/mutex.h" namespace mongo { @@ -171,7 +172,16 @@ private: StatusWith<BSONObj> _runReadCommand(OperationContext* txn, const std::string& dbname, const BSONObj& cmdObj, - const ReadPreferenceSetting& settings); + const ReadPreferenceSetting& readPref); + + /** + * Executes the specified batch write command on the current config server's primary and retries + * on the specified set of errors using the default retry policy. + */ + void _runBatchWriteCommand(OperationContext* txn, + const BatchedCommandRequest& request, + BatchedCommandResponse* response, + const ShardRegistry::ErrorCodesSet& errorsToCheck); /** * Helper method for running a count command against the config server with appropriate diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp index 582e888520a..ee82dce9bd8 100644 --- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp +++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp @@ -142,7 +142,7 @@ protected: BSONObjBuilder createResponseBuilder; Command::appendCommandStatus(createResponseBuilder, - Status(ErrorCodes::HostUnreachable, "socket error")); + Status(ErrorCodes::ExceededTimeLimit, "operation timed out")); expectConfigCollectionCreate( configHost, getConfigCollName(), _cappedSize, createResponseBuilder.obj()); diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp new file mode 100644 index 00000000000..287c10a5c00 --- /dev/null +++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp @@ -0,0 +1,246 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding + +#include "mongo/platform/basic.h" + +#include <set> +#include <string> +#include <vector> + +#include "mongo/client/remote_command_targeter_mock.h" +#include "mongo/db/client.h" +#include "mongo/db/commands.h" +#include "mongo/executor/network_interface_mock.h" +#include "mongo/executor/task_executor.h" +#include "mongo/rpc/metadata/repl_set_metadata.h" +#include "mongo/rpc/metadata/server_selection_metadata.h" +#include "mongo/s/catalog/dist_lock_manager_mock.h" +#include "mongo/s/catalog/replset/catalog_manager_replica_set.h" +#include "mongo/s/catalog/replset/catalog_manager_replica_set_test_fixture.h" +#include "mongo/s/catalog/type_changelog.h" +#include "mongo/s/catalog/type_chunk.h" +#include "mongo/s/catalog/type_collection.h" +#include "mongo/s/catalog/type_database.h" +#include "mongo/s/catalog/type_shard.h" +#include "mongo/s/chunk.h" +#include "mongo/s/client/shard_registry.h" +#include "mongo/s/grid.h" +#include "mongo/stdx/chrono.h" +#include "mongo/stdx/future.h" +#include "mongo/util/log.h" + +namespace mongo { +namespace { + +using executor::NetworkInterfaceMock; +using executor::RemoteCommandRequest; +using executor::RemoteCommandResponse; +using executor::TaskExecutor; +using std::set; +using std::string; +using std::vector; +using stdx::chrono::milliseconds; +using unittest::assertGet; + +using InsertRetryTest = CatalogManagerReplSetTestFixture; + +const NamespaceString kTestNamespace("config.TestColl"); +const HostAndPort kTestHosts[] = { + HostAndPort("TestHost1:12345"), HostAndPort("TestHost2:12345"), HostAndPort("TestHost3:12345")}; + +TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) { + configTargeter()->setFindHostReturnValue({kTestHosts[0]}); + + BSONObj objToInsert = BSON("_id" << 1 << "Value" + << "TestValue"); + + auto future = launchAsync([&] { + Status status = catalogManager()->insertConfigDocument( + operationContext(), kTestNamespace.ns(), objToInsert); + ASSERT_OK(status); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[0]); + configTargeter()->setFindHostReturnValue({kTestHosts[1]}); + return Status(ErrorCodes::Interrupted, "Interruption"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[1]); + configTargeter()->setFindHostReturnValue({kTestHosts[2]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + expectInserts(kTestNamespace, {objToInsert}); + + future.timed_get(kFutureTimeout); +} + +TEST_F(InsertRetryTest, RetryOnNetworkErrorFails) { + configTargeter()->setFindHostReturnValue({kTestHosts[0]}); + + BSONObj objToInsert = BSON("_id" << 1 << "Value" + << "TestValue"); + + auto future = launchAsync([&] { + Status status = catalogManager()->insertConfigDocument( + operationContext(), kTestNamespace.ns(), objToInsert); + ASSERT_EQ(ErrorCodes::NetworkTimeout, status.code()); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[0]); + configTargeter()->setFindHostReturnValue({kTestHosts[1]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[1]); + configTargeter()->setFindHostReturnValue({kTestHosts[2]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[2]); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + future.timed_get(kFutureTimeout); +} + +TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorMatch) { + configTargeter()->setFindHostReturnValue({kTestHosts[0]}); + + BSONObj objToInsert = BSON("_id" << 1 << "Value" + << "TestValue"); + + auto future = launchAsync([&] { + Status status = catalogManager()->insertConfigDocument( + operationContext(), kTestNamespace.ns(), objToInsert); + ASSERT_OK(status); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[0]); + configTargeter()->setFindHostReturnValue({kTestHosts[1]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[1]); + configTargeter()->setFindHostReturnValue({kTestHosts[2]}); + return Status(ErrorCodes::DuplicateKey, "Duplicate key"); + }); + + onFindCommand([&](const RemoteCommandRequest& request) { + auto query = + assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false)); + ASSERT_EQ(BSON("_id" << 1), query->getFilter()); + + return vector<BSONObj>{objToInsert}; + }); + + future.timed_get(kFutureTimeout); +} + +TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorNotFound) { + configTargeter()->setFindHostReturnValue({kTestHosts[0]}); + + BSONObj objToInsert = BSON("_id" << 1 << "Value" + << "TestValue"); + + auto future = launchAsync([&] { + Status status = catalogManager()->insertConfigDocument( + operationContext(), kTestNamespace.ns(), objToInsert); + ASSERT_EQ(ErrorCodes::DuplicateKey, status.code()); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[0]); + configTargeter()->setFindHostReturnValue({kTestHosts[1]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[1]); + configTargeter()->setFindHostReturnValue({kTestHosts[2]}); + return Status(ErrorCodes::DuplicateKey, "Duplicate key"); + }); + + onFindCommand([&](const RemoteCommandRequest& request) { + auto query = + assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false)); + ASSERT_EQ(BSON("_id" << 1), query->getFilter()); + + return vector<BSONObj>(); + }); + + future.timed_get(kFutureTimeout); +} + +TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorMismatch) { + configTargeter()->setFindHostReturnValue({kTestHosts[0]}); + + BSONObj objToInsert = BSON("_id" << 1 << "Value" + << "TestValue"); + + auto future = launchAsync([&] { + Status status = catalogManager()->insertConfigDocument( + operationContext(), kTestNamespace.ns(), objToInsert); + ASSERT_EQ(ErrorCodes::DuplicateKey, status.code()); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[0]); + configTargeter()->setFindHostReturnValue({kTestHosts[1]}); + return Status(ErrorCodes::NetworkTimeout, "Network timeout"); + }); + + onCommand([&](const RemoteCommandRequest& request) { + ASSERT_EQ(request.target, kTestHosts[1]); + configTargeter()->setFindHostReturnValue({kTestHosts[2]}); + return Status(ErrorCodes::DuplicateKey, "Duplicate key"); + }); + + onFindCommand([&](const RemoteCommandRequest& request) { + auto query = + assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false)); + ASSERT_EQ(BSON("_id" << 1), query->getFilter()); + + return vector<BSONObj>{BSON("_id" << 1 << "Value" + << "TestValue has changed")}; + }); + + future.timed_get(kFutureTimeout); +} + +} // namespace +} // namespace mongo diff --git a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp index 74d6246ae65..87ce4f74bb4 100644 --- a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp +++ b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp @@ -188,8 +188,8 @@ Status DistLockCatalogImpl::ping(OperationContext* txn, StringData processID, Da request.setUpsert(true); request.setWriteConcern(_writeConcern); - auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries( - txn, _locksNS.db().toString(), request.toBSON()); + auto resultStatus = _client->runCommandOnConfigWithRetries( + txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors); if (!resultStatus.isOK()) { return resultStatus.getStatus(); @@ -220,8 +220,8 @@ StatusWith<LocksType> DistLockCatalogImpl::grabLock(OperationContext* txn, request.setShouldReturnNew(true); request.setWriteConcern(_writeConcern); - auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries( - txn, _locksNS.db().toString(), request.toBSON()); + auto resultStatus = _client->runCommandOnConfigWithRetries( + txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors); if (!resultStatus.isOK()) { return resultStatus.getStatus(); @@ -274,8 +274,8 @@ StatusWith<LocksType> DistLockCatalogImpl::overtakeLock(OperationContext* txn, request.setShouldReturnNew(true); request.setWriteConcern(_writeConcern); - auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries( - txn, _locksNS.db().toString(), request.toBSON()); + auto resultStatus = _client->runCommandOnConfigWithRetries( + txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors); if (!resultStatus.isOK()) { return resultStatus.getStatus(); @@ -306,8 +306,8 @@ Status DistLockCatalogImpl::unlock(OperationContext* txn, const OID& lockSession BSON("$set" << BSON(LocksType::state(LocksType::UNLOCKED)))); request.setWriteConcern(_writeConcern); - auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries( - txn, _locksNS.db().toString(), request.toBSON()); + auto resultStatus = _client->runCommandOnConfigWithRetries( + txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors); if (!resultStatus.isOK()) { return resultStatus.getStatus(); @@ -418,8 +418,8 @@ Status DistLockCatalogImpl::stopPing(OperationContext* txn, StringData processId FindAndModifyRequest::makeRemove(_lockPingNS, BSON(LockpingsType::process() << processId)); request.setWriteConcern(_writeConcern); - auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries( - txn, _locksNS.db().toString(), request.toBSON()); + auto resultStatus = _client->runCommandOnConfigWithRetries( + txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors); if (!resultStatus.isOK()) { return resultStatus.getStatus(); diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp index 181e5fcd84a..1bad7249647 100644 --- a/src/mongo/s/client/shard_registry.cpp +++ b/src/mongo/s/client/shard_registry.cpp @@ -85,9 +85,6 @@ const BSONObj kReplSecondaryOkMetadata{[] { return o.obj(); }()}; -ShardRegistry::ErrorCodesSet kNotMasterErrors{ErrorCodes::NotMaster, - ErrorCodes::NotMasterNoSlaveOk}; - BSONObj appendMaxTimeToCmdObj(long long maxTimeMicros, const BSONObj& cmdObj) { Seconds maxTime = kConfigCommandTimeout; @@ -123,6 +120,19 @@ BSONObj appendMaxTimeToCmdObj(long long maxTimeMicros, const BSONObj& cmdObj) { } // unnamed namespace +const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::NotMaster, + ErrorCodes::NotMasterNoSlaveOk}; +const ShardRegistry::ErrorCodesSet ShardRegistry::kNetworkOrNotMasterErrors{ + ErrorCodes::NotMaster, + ErrorCodes::NotMasterNoSlaveOk, + ErrorCodes::HostUnreachable, + ErrorCodes::HostNotFound, + ErrorCodes::NetworkTimeout, + // This set includes interrupted because replica set step down kills all server operations + // before it closes connections so it may happen that the caller actually receives the + // interruption. + ErrorCodes::Interrupted}; + ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory, std::unique_ptr<executor::TaskExecutorPool> executorPool, executor::NetworkInterface* network, @@ -520,8 +530,7 @@ StatusWith<ShardRegistry::QueryResponse> ShardRegistry::exhaustiveFindOnConfig( return result; } - if ((ErrorCodes::isNetworkError(result.getStatus().code()) || - ErrorCodes::isNotMasterError(result.getStatus().code())) && + if (kNetworkOrNotMasterErrors.count(result.getStatus().code()) && retry < kOnErrorNumRetries) { continue; } @@ -611,25 +620,6 @@ StatusWith<BSONObj> ShardRegistry::runCommandOnConfig(OperationContext* txn, return response.getValue().response; } -StatusWith<BSONObj> ShardRegistry::runCommandOnConfigWithNotMasterRetries(OperationContext* txn, - const std::string& dbname, - const BSONObj& cmdObj) { - auto response = _runCommandWithRetries(txn, - _executorPool->getFixedExecutor(), - getConfigShard(), - ReadPreferenceSetting{ReadPreference::PrimaryOnly}, - dbname, - cmdObj, - kReplMetadata, - kNotMasterErrors); - if (!response.isOK()) { - return response.getStatus(); - } - - advanceConfigOpTime(response.getValue().visibleOpTime); - return response.getValue().response; -} - StatusWith<BSONObj> ShardRegistry::runCommandWithNotMasterRetries(OperationContext* txn, const ShardId& shardId, const std::string& dbname, diff --git a/src/mongo/s/client/shard_registry.h b/src/mongo/s/client/shard_registry.h index 3f83a49f764..c99a496883b 100644 --- a/src/mongo/s/client/shard_registry.h +++ b/src/mongo/s/client/shard_registry.h @@ -278,26 +278,23 @@ public: const std::string& dbname, const BSONObj& cmdObj); - StatusWith<BSONObj> runCommandOnConfigWithNotMasterRetries(OperationContext* txn, - const std::string& dbname, - const BSONObj& cmdObj); - class ErrorCodesHash { public: size_t operator()(ErrorCodes::Error e) const { return std::hash<typename std::underlying_type<ErrorCodes::Error>::type>()(e); } }; + using ErrorCodesSet = unordered_set<ErrorCodes::Error, ErrorCodesHash>; /** - * Runs commands against a config shard. Retries if executing the command fails with one - * of the given error codes, or if executing the command succeeds but the command - * failed with one of the codes. If executing the command fails with a different - * code we return that code. If executing the command succeeds and the command - * itself succeeds or fails with a code not in the set, then we return the command response - * object. Thus the caller is responsible for checking the command response object for any kind - * of command-specific failures other than those specified in errorsToCheck. + * Runs commands against the config shard's primary. Retries if executing the command fails with + * one of the given error codes, or if executing the command succeeds but the server returned + * one of the codes. If executing the command fails with a different code we return that code. + * If executing the command succeeds and the command itself succeeds or fails with a code not in + * the set, then we return the command response object. Thus the caller is responsible for + * checking the command response object for any kind of command-specific failures other than + * those specified in errorsToCheck. */ StatusWith<BSONObj> runCommandOnConfigWithRetries(OperationContext* txn, const std::string& dbname, @@ -312,6 +309,18 @@ public: const HostAndPort& remoteHost, const Status& remoteCommandStatus); + /** + * Set of error codes, which indicate that the remote host is not the current master. Retries on + * errors from this set are always safe and should be used by default. + */ + static const ErrorCodesSet kNotMasterErrors; + + /** + * Set of error codes which includes NotMaster and any network exceptions. Retries on errors + * from this set are not always safe and may require some additional idempotency guarantees. + */ + static const ErrorCodesSet kNetworkOrNotMasterErrors; + private: using ShardMap = std::unordered_map<ShardId, std::shared_ptr<Shard>>; diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp index 7ac1a1a5319..8f14d2bc03f 100644 --- a/src/mongo/s/query/async_results_merger.cpp +++ b/src/mongo/s/query/async_results_merger.cpp @@ -56,7 +56,7 @@ const int kMaxNumFailedHostRetryAttempts = 3; * be retried. */ bool isPerShardRetriableError(ErrorCodes::Error err) { - return (ErrorCodes::isNetworkError(err) || ErrorCodes::isNotMasterError(err) || + return (ShardRegistry::kNetworkOrNotMasterErrors.count(err) || err == ErrorCodes::NotMasterOrSecondary); } |