summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaloian Manassiev <kaloian.manassiev@mongodb.com>2015-11-12 16:20:11 -0500
committerKaloian Manassiev <kaloian.manassiev@mongodb.com>2015-11-16 18:04:24 -0500
commit10872ad8758b99a266c0340b88cd81f20b89bdf7 (patch)
treeb2aef171852486385a39326fd3cdfc63dbfccb9a
parentbd58ea2ba5d17b960981990bb97cab133d7e90ed (diff)
downloadmongo-10872ad8758b99a266c0340b88cd81f20b89bdf7.tar.gz
SERVER-21392 Make commands retry on interrupted operations
-rw-r--r--src/mongo/s/catalog/replset/SConscript21
-rw-r--r--src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp67
-rw-r--r--src/mongo/s/catalog/replset/catalog_manager_replica_set.h12
-rw-r--r--src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp2
-rw-r--r--src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp246
-rw-r--r--src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp20
-rw-r--r--src/mongo/s/client/shard_registry.cpp38
-rw-r--r--src/mongo/s/client/shard_registry.h31
-rw-r--r--src/mongo/s/query/async_results_merger.cpp2
9 files changed, 344 insertions, 95 deletions
diff --git a/src/mongo/s/catalog/replset/SConscript b/src/mongo/s/catalog/replset/SConscript
index f4ec2adca85..ecf08b2927a 100644
--- a/src/mongo/s/catalog/replset/SConscript
+++ b/src/mongo/s/catalog/replset/SConscript
@@ -12,24 +12,11 @@ env.Library(
'$BUILD_DIR/mongo/s/catalog/catalog_types',
'$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_interface',
'$BUILD_DIR/mongo/s/catalog/dist_lock_manager',
+ '$BUILD_DIR/mongo/s/client/sharding_client',
'$BUILD_DIR/mongo/util/fail_point'
],
)
-env.CppUnitTest(
- target='replset_dist_lock_manager_test',
- source=[
- 'replset_dist_lock_manager_test.cpp',
- ],
- LIBDEPS=[
- '$BUILD_DIR/mongo/db/service_context',
- '$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_mock',
- '$BUILD_DIR/mongo/util/foundation',
- '$BUILD_DIR/mongo/util/tick_source_mock',
- 'replset_dist_lock_manager'
- ]
-)
-
env.Library(
target='dist_lock_catalog_impl',
source=[
@@ -52,9 +39,10 @@ env.Library(
)
env.CppUnitTest(
- target='dist_lock_catalog_impl_test',
+ target='replset_dist_lock_manager_test',
source=[
'dist_lock_catalog_impl_test.cpp',
+ 'replset_dist_lock_manager_test.cpp',
],
LIBDEPS=[
'$BUILD_DIR/mongo/client/remote_command_targeter_mock',
@@ -62,8 +50,10 @@ env.CppUnitTest(
'$BUILD_DIR/mongo/executor/network_test_env',
'$BUILD_DIR/mongo/executor/thread_pool_task_executor_test_fixture',
'$BUILD_DIR/mongo/s/catalog/catalog_manager_mock',
+ '$BUILD_DIR/mongo/s/catalog/dist_lock_catalog_mock',
'$BUILD_DIR/mongo/s/coreshard',
'$BUILD_DIR/mongo/s/mongoscore',
+ '$BUILD_DIR/mongo/util/tick_source_mock',
'dist_lock_catalog_impl',
]
)
@@ -97,6 +87,7 @@ env.CppUnitTest(
'catalog_manager_replica_set_test.cpp',
'catalog_manager_replica_set_test_fixture.cpp',
'catalog_manager_replica_set_upgrade_test.cpp',
+ 'catalog_manager_replica_set_write_retry_test.cpp',
],
LIBDEPS=[
'$BUILD_DIR/mongo/s/sharding_test_fixture',
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 6e53cd31590..674064e21f1 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -821,8 +821,8 @@ bool CatalogManagerReplicaSet::runUserManagementWriteCommand(OperationContext* t
cmdToRun = modifiedCmd.obj();
}
- auto response =
- grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, dbname, cmdToRun);
+ auto response = grid.shardRegistry()->runCommandOnConfigWithRetries(
+ txn, dbname, cmdToRun, ShardRegistry::kNotMasterErrors);
if (!response.isOK()) {
return Command::appendCommandStatus(*result, response.getStatus());
@@ -865,15 +865,9 @@ bool CatalogManagerReplicaSet::runUserManagementReadCommand(OperationContext* tx
Status CatalogManagerReplicaSet::applyChunkOpsDeprecated(OperationContext* txn,
const BSONArray& updateOps,
const BSONArray& preCondition) {
- ShardRegistry::ErrorCodesSet networkOrNotMasterErrors{ErrorCodes::NotMaster,
- ErrorCodes::NotMasterNoSlaveOk,
- ErrorCodes::HostUnreachable,
- ErrorCodes::HostNotFound,
- ErrorCodes::NetworkTimeout};
-
BSONObj cmd = BSON("applyOps" << updateOps << "preCondition" << preCondition);
auto response = grid.shardRegistry()->runCommandOnConfigWithRetries(
- txn, "config", cmd, networkOrNotMasterErrors);
+ txn, "config", cmd, ShardRegistry::kNetworkOrNotMasterErrors);
if (!response.isOK()) {
return response.getStatus();
@@ -897,12 +891,21 @@ DistLockManager* CatalogManagerReplicaSet::getDistLockManager() {
void CatalogManagerReplicaSet::writeConfigServerDirect(OperationContext* txn,
const BatchedCommandRequest& batchRequest,
BatchedCommandResponse* batchResponse) {
- std::string dbname = batchRequest.getNS().db().toString();
+ _runBatchWriteCommand(txn, batchRequest, batchResponse, ShardRegistry::kNotMasterErrors);
+}
+
+void CatalogManagerReplicaSet::_runBatchWriteCommand(
+ OperationContext* txn,
+ const BatchedCommandRequest& batchRequest,
+ BatchedCommandResponse* batchResponse,
+ const ShardRegistry::ErrorCodesSet& errorsToCheck) {
+ const std::string dbname = batchRequest.getNS().db().toString();
invariant(dbname == "config" || dbname == "admin");
+
const BSONObj cmdObj = batchRequest.toBSON();
auto response =
- grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, dbname, cmdObj);
+ grid.shardRegistry()->runCommandOnConfigWithRetries(txn, dbname, cmdObj, errorsToCheck);
if (!response.isOK()) {
_toBatchError(response.getStatus(), batchResponse);
return;
@@ -934,7 +937,7 @@ Status CatalogManagerReplicaSet::insertConfigDocument(OperationContext* txn,
for (int retry = 1; retry <= kMaxWriteRetry; retry++) {
BatchedCommandResponse response;
- writeConfigServerDirect(txn, request, &response);
+ _runBatchWriteCommand(txn, request, &response, ShardRegistry::kNotMasterErrors);
Status status = response.toStatus();
@@ -958,16 +961,25 @@ Status CatalogManagerReplicaSet::insertConfigDocument(OperationContext* txn,
}
auto existingDocs = fetchDuplicate.getValue().value;
+ if (existingDocs.empty()) {
+ return Status(ErrorCodes::DuplicateKey,
+ stream() << "DuplicateKey error" << causedBy(status)
+ << " was returned after a retry attempt, but no documents "
+ "were found. This means a concurrent change occurred "
+ "together with the retries.");
+ }
+
invariant(existingDocs.size() == 1);
BSONObj existing = std::move(existingDocs.front());
if (existing.woCompare(doc) == 0) {
// Documents match, so treat the operation as success
- status = Status::OK();
+ return Status::OK();
}
}
- if (ErrorCodes::isNetworkError(status.code()) && (retry < kMaxWriteRetry)) {
+ if (ShardRegistry::kNetworkOrNotMasterErrors.count(status.code()) &&
+ (retry < kMaxWriteRetry)) {
continue;
}
@@ -1031,20 +1043,10 @@ Status CatalogManagerReplicaSet::removeConfigDocuments(OperationContext* txn,
request.setNS(nss);
request.setWriteConcern(WriteConcernOptions::Majority);
- for (int retry = 1; retry <= kMaxWriteRetry; retry++) {
- BatchedCommandResponse response;
- writeConfigServerDirect(txn, request, &response);
-
- Status status = response.toStatus();
-
- if (ErrorCodes::isNetworkError(status.code()) && (retry < kMaxWriteRetry)) {
- continue;
- }
-
- return status;
- }
+ BatchedCommandResponse response;
+ _runBatchWriteCommand(txn, request, &response, ShardRegistry::kNetworkOrNotMasterErrors);
- MONGO_UNREACHABLE;
+ return response.toStatus();
}
Status CatalogManagerReplicaSet::_checkDbDoesNotExist(OperationContext* txn,
@@ -1132,8 +1134,8 @@ Status CatalogManagerReplicaSet::_createCappedConfigCollection(OperationContext*
StringData collName,
int cappedSize) {
BSONObj createCmd = BSON("create" << collName << "capped" << true << "size" << cappedSize);
- auto result =
- grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(txn, "config", createCmd);
+ auto result = grid.shardRegistry()->runCommandOnConfigWithRetries(
+ txn, "config", createCmd, ShardRegistry::kNetworkOrNotMasterErrors);
if (!result.isOK()) {
return result.getStatus();
}
@@ -1305,14 +1307,15 @@ StatusWith<BSONObj> CatalogManagerReplicaSet::_runReadCommand(
OperationContext* txn,
const std::string& dbname,
const BSONObj& cmdObj,
- const ReadPreferenceSetting& settings) {
+ const ReadPreferenceSetting& readPref) {
for (int retry = 1; retry <= kMaxReadRetry; ++retry) {
- auto response = grid.shardRegistry()->runCommandOnConfig(txn, settings, dbname, cmdObj);
+ auto response = grid.shardRegistry()->runCommandOnConfig(txn, readPref, dbname, cmdObj);
if (response.isOK()) {
return response;
}
- if (ErrorCodes::isNetworkError(response.getStatus().code()) && retry < kMaxReadRetry) {
+ if (ShardRegistry::kNetworkOrNotMasterErrors.count(response.getStatus().code()) &&
+ retry < kMaxReadRetry) {
continue;
}
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.h b/src/mongo/s/catalog/replset/catalog_manager_replica_set.h
index c2145dc90f1..93a8dc06ca7 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.h
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.h
@@ -31,6 +31,7 @@
#include "mongo/client/connection_string.h"
#include "mongo/db/repl/optime.h"
#include "mongo/s/catalog/catalog_manager_common.h"
+#include "mongo/s/client/shard_registry.h"
#include "mongo/stdx/mutex.h"
namespace mongo {
@@ -171,7 +172,16 @@ private:
StatusWith<BSONObj> _runReadCommand(OperationContext* txn,
const std::string& dbname,
const BSONObj& cmdObj,
- const ReadPreferenceSetting& settings);
+ const ReadPreferenceSetting& readPref);
+
+ /**
+ * Executes the specified batch write command on the current config server's primary and retries
+ * on the specified set of errors using the default retry policy.
+ */
+ void _runBatchWriteCommand(OperationContext* txn,
+ const BatchedCommandRequest& request,
+ BatchedCommandResponse* response,
+ const ShardRegistry::ErrorCodesSet& errorsToCheck);
/**
* Helper method for running a count command against the config server with appropriate
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp
index 582e888520a..ee82dce9bd8 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_log_change_test.cpp
@@ -142,7 +142,7 @@ protected:
BSONObjBuilder createResponseBuilder;
Command::appendCommandStatus(createResponseBuilder,
- Status(ErrorCodes::HostUnreachable, "socket error"));
+ Status(ErrorCodes::ExceededTimeLimit, "operation timed out"));
expectConfigCollectionCreate(
configHost, getConfigCollName(), _cappedSize, createResponseBuilder.obj());
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
new file mode 100644
index 00000000000..287c10a5c00
--- /dev/null
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -0,0 +1,246 @@
+/**
+ * Copyright (C) 2015 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kSharding
+
+#include "mongo/platform/basic.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "mongo/client/remote_command_targeter_mock.h"
+#include "mongo/db/client.h"
+#include "mongo/db/commands.h"
+#include "mongo/executor/network_interface_mock.h"
+#include "mongo/executor/task_executor.h"
+#include "mongo/rpc/metadata/repl_set_metadata.h"
+#include "mongo/rpc/metadata/server_selection_metadata.h"
+#include "mongo/s/catalog/dist_lock_manager_mock.h"
+#include "mongo/s/catalog/replset/catalog_manager_replica_set.h"
+#include "mongo/s/catalog/replset/catalog_manager_replica_set_test_fixture.h"
+#include "mongo/s/catalog/type_changelog.h"
+#include "mongo/s/catalog/type_chunk.h"
+#include "mongo/s/catalog/type_collection.h"
+#include "mongo/s/catalog/type_database.h"
+#include "mongo/s/catalog/type_shard.h"
+#include "mongo/s/chunk.h"
+#include "mongo/s/client/shard_registry.h"
+#include "mongo/s/grid.h"
+#include "mongo/stdx/chrono.h"
+#include "mongo/stdx/future.h"
+#include "mongo/util/log.h"
+
+namespace mongo {
+namespace {
+
+using executor::NetworkInterfaceMock;
+using executor::RemoteCommandRequest;
+using executor::RemoteCommandResponse;
+using executor::TaskExecutor;
+using std::set;
+using std::string;
+using std::vector;
+using stdx::chrono::milliseconds;
+using unittest::assertGet;
+
+using InsertRetryTest = CatalogManagerReplSetTestFixture;
+
+const NamespaceString kTestNamespace("config.TestColl");
+const HostAndPort kTestHosts[] = {
+ HostAndPort("TestHost1:12345"), HostAndPort("TestHost2:12345"), HostAndPort("TestHost3:12345")};
+
+TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
+ configTargeter()->setFindHostReturnValue({kTestHosts[0]});
+
+ BSONObj objToInsert = BSON("_id" << 1 << "Value"
+ << "TestValue");
+
+ auto future = launchAsync([&] {
+ Status status = catalogManager()->insertConfigDocument(
+ operationContext(), kTestNamespace.ns(), objToInsert);
+ ASSERT_OK(status);
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[0]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[1]});
+ return Status(ErrorCodes::Interrupted, "Interruption");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[1]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[2]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ expectInserts(kTestNamespace, {objToInsert});
+
+ future.timed_get(kFutureTimeout);
+}
+
+TEST_F(InsertRetryTest, RetryOnNetworkErrorFails) {
+ configTargeter()->setFindHostReturnValue({kTestHosts[0]});
+
+ BSONObj objToInsert = BSON("_id" << 1 << "Value"
+ << "TestValue");
+
+ auto future = launchAsync([&] {
+ Status status = catalogManager()->insertConfigDocument(
+ operationContext(), kTestNamespace.ns(), objToInsert);
+ ASSERT_EQ(ErrorCodes::NetworkTimeout, status.code());
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[0]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[1]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[1]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[2]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[2]);
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ future.timed_get(kFutureTimeout);
+}
+
+TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorMatch) {
+ configTargeter()->setFindHostReturnValue({kTestHosts[0]});
+
+ BSONObj objToInsert = BSON("_id" << 1 << "Value"
+ << "TestValue");
+
+ auto future = launchAsync([&] {
+ Status status = catalogManager()->insertConfigDocument(
+ operationContext(), kTestNamespace.ns(), objToInsert);
+ ASSERT_OK(status);
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[0]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[1]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[1]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[2]});
+ return Status(ErrorCodes::DuplicateKey, "Duplicate key");
+ });
+
+ onFindCommand([&](const RemoteCommandRequest& request) {
+ auto query =
+ assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false));
+ ASSERT_EQ(BSON("_id" << 1), query->getFilter());
+
+ return vector<BSONObj>{objToInsert};
+ });
+
+ future.timed_get(kFutureTimeout);
+}
+
+TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorNotFound) {
+ configTargeter()->setFindHostReturnValue({kTestHosts[0]});
+
+ BSONObj objToInsert = BSON("_id" << 1 << "Value"
+ << "TestValue");
+
+ auto future = launchAsync([&] {
+ Status status = catalogManager()->insertConfigDocument(
+ operationContext(), kTestNamespace.ns(), objToInsert);
+ ASSERT_EQ(ErrorCodes::DuplicateKey, status.code());
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[0]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[1]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[1]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[2]});
+ return Status(ErrorCodes::DuplicateKey, "Duplicate key");
+ });
+
+ onFindCommand([&](const RemoteCommandRequest& request) {
+ auto query =
+ assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false));
+ ASSERT_EQ(BSON("_id" << 1), query->getFilter());
+
+ return vector<BSONObj>();
+ });
+
+ future.timed_get(kFutureTimeout);
+}
+
+TEST_F(InsertRetryTest, DuplicateKeyErrorAfterNetworkErrorMismatch) {
+ configTargeter()->setFindHostReturnValue({kTestHosts[0]});
+
+ BSONObj objToInsert = BSON("_id" << 1 << "Value"
+ << "TestValue");
+
+ auto future = launchAsync([&] {
+ Status status = catalogManager()->insertConfigDocument(
+ operationContext(), kTestNamespace.ns(), objToInsert);
+ ASSERT_EQ(ErrorCodes::DuplicateKey, status.code());
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[0]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[1]});
+ return Status(ErrorCodes::NetworkTimeout, "Network timeout");
+ });
+
+ onCommand([&](const RemoteCommandRequest& request) {
+ ASSERT_EQ(request.target, kTestHosts[1]);
+ configTargeter()->setFindHostReturnValue({kTestHosts[2]});
+ return Status(ErrorCodes::DuplicateKey, "Duplicate key");
+ });
+
+ onFindCommand([&](const RemoteCommandRequest& request) {
+ auto query =
+ assertGet(LiteParsedQuery::makeFromFindCommand(kTestNamespace, request.cmdObj, false));
+ ASSERT_EQ(BSON("_id" << 1), query->getFilter());
+
+ return vector<BSONObj>{BSON("_id" << 1 << "Value"
+ << "TestValue has changed")};
+ });
+
+ future.timed_get(kFutureTimeout);
+}
+
+} // namespace
+} // namespace mongo
diff --git a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp
index 74d6246ae65..87ce4f74bb4 100644
--- a/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp
+++ b/src/mongo/s/catalog/replset/dist_lock_catalog_impl.cpp
@@ -188,8 +188,8 @@ Status DistLockCatalogImpl::ping(OperationContext* txn, StringData processID, Da
request.setUpsert(true);
request.setWriteConcern(_writeConcern);
- auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries(
- txn, _locksNS.db().toString(), request.toBSON());
+ auto resultStatus = _client->runCommandOnConfigWithRetries(
+ txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors);
if (!resultStatus.isOK()) {
return resultStatus.getStatus();
@@ -220,8 +220,8 @@ StatusWith<LocksType> DistLockCatalogImpl::grabLock(OperationContext* txn,
request.setShouldReturnNew(true);
request.setWriteConcern(_writeConcern);
- auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries(
- txn, _locksNS.db().toString(), request.toBSON());
+ auto resultStatus = _client->runCommandOnConfigWithRetries(
+ txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors);
if (!resultStatus.isOK()) {
return resultStatus.getStatus();
@@ -274,8 +274,8 @@ StatusWith<LocksType> DistLockCatalogImpl::overtakeLock(OperationContext* txn,
request.setShouldReturnNew(true);
request.setWriteConcern(_writeConcern);
- auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries(
- txn, _locksNS.db().toString(), request.toBSON());
+ auto resultStatus = _client->runCommandOnConfigWithRetries(
+ txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors);
if (!resultStatus.isOK()) {
return resultStatus.getStatus();
@@ -306,8 +306,8 @@ Status DistLockCatalogImpl::unlock(OperationContext* txn, const OID& lockSession
BSON("$set" << BSON(LocksType::state(LocksType::UNLOCKED))));
request.setWriteConcern(_writeConcern);
- auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries(
- txn, _locksNS.db().toString(), request.toBSON());
+ auto resultStatus = _client->runCommandOnConfigWithRetries(
+ txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors);
if (!resultStatus.isOK()) {
return resultStatus.getStatus();
@@ -418,8 +418,8 @@ Status DistLockCatalogImpl::stopPing(OperationContext* txn, StringData processId
FindAndModifyRequest::makeRemove(_lockPingNS, BSON(LockpingsType::process() << processId));
request.setWriteConcern(_writeConcern);
- auto resultStatus = _client->runCommandOnConfigWithNotMasterRetries(
- txn, _locksNS.db().toString(), request.toBSON());
+ auto resultStatus = _client->runCommandOnConfigWithRetries(
+ txn, _locksNS.db().toString(), request.toBSON(), ShardRegistry::kNotMasterErrors);
if (!resultStatus.isOK()) {
return resultStatus.getStatus();
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index 181e5fcd84a..1bad7249647 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -85,9 +85,6 @@ const BSONObj kReplSecondaryOkMetadata{[] {
return o.obj();
}()};
-ShardRegistry::ErrorCodesSet kNotMasterErrors{ErrorCodes::NotMaster,
- ErrorCodes::NotMasterNoSlaveOk};
-
BSONObj appendMaxTimeToCmdObj(long long maxTimeMicros, const BSONObj& cmdObj) {
Seconds maxTime = kConfigCommandTimeout;
@@ -123,6 +120,19 @@ BSONObj appendMaxTimeToCmdObj(long long maxTimeMicros, const BSONObj& cmdObj) {
} // unnamed namespace
+const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::NotMaster,
+ ErrorCodes::NotMasterNoSlaveOk};
+const ShardRegistry::ErrorCodesSet ShardRegistry::kNetworkOrNotMasterErrors{
+ ErrorCodes::NotMaster,
+ ErrorCodes::NotMasterNoSlaveOk,
+ ErrorCodes::HostUnreachable,
+ ErrorCodes::HostNotFound,
+ ErrorCodes::NetworkTimeout,
+ // This set includes interrupted because replica set step down kills all server operations
+ // before it closes connections so it may happen that the caller actually receives the
+ // interruption.
+ ErrorCodes::Interrupted};
+
ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
std::unique_ptr<executor::TaskExecutorPool> executorPool,
executor::NetworkInterface* network,
@@ -520,8 +530,7 @@ StatusWith<ShardRegistry::QueryResponse> ShardRegistry::exhaustiveFindOnConfig(
return result;
}
- if ((ErrorCodes::isNetworkError(result.getStatus().code()) ||
- ErrorCodes::isNotMasterError(result.getStatus().code())) &&
+ if (kNetworkOrNotMasterErrors.count(result.getStatus().code()) &&
retry < kOnErrorNumRetries) {
continue;
}
@@ -611,25 +620,6 @@ StatusWith<BSONObj> ShardRegistry::runCommandOnConfig(OperationContext* txn,
return response.getValue().response;
}
-StatusWith<BSONObj> ShardRegistry::runCommandOnConfigWithNotMasterRetries(OperationContext* txn,
- const std::string& dbname,
- const BSONObj& cmdObj) {
- auto response = _runCommandWithRetries(txn,
- _executorPool->getFixedExecutor(),
- getConfigShard(),
- ReadPreferenceSetting{ReadPreference::PrimaryOnly},
- dbname,
- cmdObj,
- kReplMetadata,
- kNotMasterErrors);
- if (!response.isOK()) {
- return response.getStatus();
- }
-
- advanceConfigOpTime(response.getValue().visibleOpTime);
- return response.getValue().response;
-}
-
StatusWith<BSONObj> ShardRegistry::runCommandWithNotMasterRetries(OperationContext* txn,
const ShardId& shardId,
const std::string& dbname,
diff --git a/src/mongo/s/client/shard_registry.h b/src/mongo/s/client/shard_registry.h
index 3f83a49f764..c99a496883b 100644
--- a/src/mongo/s/client/shard_registry.h
+++ b/src/mongo/s/client/shard_registry.h
@@ -278,26 +278,23 @@ public:
const std::string& dbname,
const BSONObj& cmdObj);
- StatusWith<BSONObj> runCommandOnConfigWithNotMasterRetries(OperationContext* txn,
- const std::string& dbname,
- const BSONObj& cmdObj);
-
class ErrorCodesHash {
public:
size_t operator()(ErrorCodes::Error e) const {
return std::hash<typename std::underlying_type<ErrorCodes::Error>::type>()(e);
}
};
+
using ErrorCodesSet = unordered_set<ErrorCodes::Error, ErrorCodesHash>;
/**
- * Runs commands against a config shard. Retries if executing the command fails with one
- * of the given error codes, or if executing the command succeeds but the command
- * failed with one of the codes. If executing the command fails with a different
- * code we return that code. If executing the command succeeds and the command
- * itself succeeds or fails with a code not in the set, then we return the command response
- * object. Thus the caller is responsible for checking the command response object for any kind
- * of command-specific failures other than those specified in errorsToCheck.
+ * Runs commands against the config shard's primary. Retries if executing the command fails with
+ * one of the given error codes, or if executing the command succeeds but the server returned
+ * one of the codes. If executing the command fails with a different code we return that code.
+ * If executing the command succeeds and the command itself succeeds or fails with a code not in
+ * the set, then we return the command response object. Thus the caller is responsible for
+ * checking the command response object for any kind of command-specific failures other than
+ * those specified in errorsToCheck.
*/
StatusWith<BSONObj> runCommandOnConfigWithRetries(OperationContext* txn,
const std::string& dbname,
@@ -312,6 +309,18 @@ public:
const HostAndPort& remoteHost,
const Status& remoteCommandStatus);
+ /**
+ * Set of error codes, which indicate that the remote host is not the current master. Retries on
+ * errors from this set are always safe and should be used by default.
+ */
+ static const ErrorCodesSet kNotMasterErrors;
+
+ /**
+ * Set of error codes which includes NotMaster and any network exceptions. Retries on errors
+ * from this set are not always safe and may require some additional idempotency guarantees.
+ */
+ static const ErrorCodesSet kNetworkOrNotMasterErrors;
+
private:
using ShardMap = std::unordered_map<ShardId, std::shared_ptr<Shard>>;
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index 7ac1a1a5319..8f14d2bc03f 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -56,7 +56,7 @@ const int kMaxNumFailedHostRetryAttempts = 3;
* be retried.
*/
bool isPerShardRetriableError(ErrorCodes::Error err) {
- return (ErrorCodes::isNetworkError(err) || ErrorCodes::isNotMasterError(err) ||
+ return (ShardRegistry::kNetworkOrNotMasterErrors.count(err) ||
err == ErrorCodes::NotMasterOrSecondary);
}