author    Kaloian Manassiev <kaloian.manassiev@mongodb.com>    2015-12-30 17:01:13 -0500
committer Kaloian Manassiev <kaloian.manassiev@mongodb.com>    2015-12-30 17:09:28 -0500
commit    e49a2a16fb4b337d7ed1b0ec8d766f281741d8bf (patch)
tree      7f0cac8cb50e56819bfe2546a525076976b1e193 /src/mongo/s
parent    715e9e1cdc618dad480a7a1a73458daf6ea9ce0f (diff)
download  mongo-e49a2a16fb4b337d7ed1b0ec8d766f281741d8bf.tar.gz
SERVER-22027 Sharding should not retry killed operations
This change introduces a separate interruption code (11602, InterruptedDueToReplStateChange), which is used to kill operations during replication primary stepdown so that the config server retry logic can differentiate them from user-killed operations.
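For illustration, here is a minimal C++ sketch of why a distinct code lets retry logic treat a stepdown kill differently from a user-issued kill. This is not the server's actual code; the enum and helper below are hypothetical stand-ins.

#include <set>

// Hypothetical stand-ins for the relevant server error codes.
enum class ErrorCode {
    Interrupted,                      // operation killed by a user (killOp)
    InterruptedDueToReplStateChange,  // operation killed by primary stepdown (11602)
    NotMaster,
    NetworkTimeout,
};

// A stepdown kill is transient and safe to retry against a new primary;
// a user kill must be surfaced to the caller, never retried.
bool isRetriableConfigError(ErrorCode code) {
    static const std::set<ErrorCode> kRetriable{
        ErrorCode::InterruptedDueToReplStateChange,
        ErrorCode::NotMaster,
        ErrorCode::NetworkTimeout,
    };
    return kRetriable.count(code) > 0;
}

With a single Interrupted code, the retry layer cannot make this distinction and would retry operations the user explicitly killed.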
Diffstat (limited to 'src/mongo/s')
-rw-r--r--  src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp                     2
-rw-r--r--  src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp    4
-rw-r--r--  src/mongo/s/client/shard_registry.cpp                                           9
-rw-r--r--  src/mongo/s/query/async_results_merger.cpp                                      21
4 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5943cc6e255..5686476e1a3 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,8 +389,6 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
-    auto configShard = grid.shardRegistry()->getShard(txn, "config");
-
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index b21ff63961f..9bf44eaa09c 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::Interrupted, "Interruption");
+        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
     });

     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
     auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
     writeErrDetail->setIndex(0);
-    writeErrDetail->setErrCode(ErrorCodes::Interrupted);
+    writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
     writeErrDetail->setErrMessage("Operation interrupted");
     response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index 55121f42209..e5b91dda83a 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,6 +126,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
+    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
     // to be retried if idempotency can be guaranteed.
@@ -133,10 +134,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    // This set includes interrupted because replica set step down kills all server operations
-    // before it closes connections so it may happen that the caller actually receives the
-    // interruption.
-    ErrorCodes::Interrupted};
+    ErrorCodes::InterruptedDueToReplStateChange};

 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -782,7 +780,8 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
+        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
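Net effect in shard_registry.cpp: NotMasterOrSecondary joins the retriable set (previously special-cased by the results merger, see below), Interrupted is replaced by the new code, and a stepdown interruption now also marks the host as no longer primary. A sketch of the bounded retry loop that callers of such a set rely on follows; the Status type and runWithRetries helper are illustrative stand-ins, not the ShardRegistry API.

#include <functional>
#include <set>
#include <string>

// Illustrative stand-in for the server's Status type.
struct Status {
    int code = 0;  // 0 == OK
    std::string reason;
    bool isOK() const { return code == 0; }
};

// Retry a command while it keeps failing with codes from the retriable
// set; any other error (e.g. a user-issued kill) is returned at once.
Status runWithRetries(const std::function<Status()>& runCommand,
                      const std::set<int>& retriableCodes,
                      int maxRetries) {
    Status status = runCommand();
    for (int attempt = 0;
         attempt < maxRetries && !status.isOK() && retriableCodes.count(status.code);
         ++attempt) {
        status = runCommand();
    }
    return status;  // success, non-retriable error, or retries exhausted
}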
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index 8b23528a04a..c82de6a3bbe 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,15 +51,6 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;

-/**
- * Returns whether a particular error code returned from the initial cursor establishment should
- * be retried.
- */
-bool isPerShardRetriableError(ErrorCodes::Error err) {
-    return (ShardRegistry::kAllRetriableErrors.count(err) ||
-            err == ErrorCodes::NotMasterOrSecondary);
-}
-
 }  // namespace

 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -438,8 +429,7 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            // TODO: Pass down an OperationContext* to use here.
-            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
+            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -453,7 +443,10 @@ void AsyncResultsMerger::handleBatchResponse(
         // If the error is retriable, schedule another request.
         if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-            isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
+            ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
+            LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
+                   << causedBy(cursorResponseStatus.getStatus());
+
             ++remote.retryCount;

             // Since we potentially updated the targeter that the last host it chose might be
@@ -641,13 +634,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);

-    // TODO: Pass down an OperationContext* to use here.
-    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
+    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
     if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << *shardId);
     }

+    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {
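For context, the merger's retry gate above combines three conditions. A standalone sketch of that predicate follows; the field and constant names mirror the diff, but the surrounding type is a stand-in, not the merger's real RemoteCursorData.

// Stand-in for the merger's per-remote bookkeeping.
struct RemoteCursorData {
    long long cursorId = 0;  // non-zero once the remote cursor is established
    int retryCount = 0;
};

const int kMaxNumFailedHostRetryAttempts = 3;

// Retry only while establishing the cursor: once a cursorId exists, a
// failed batch cannot be transparently replayed against another host.
bool shouldRetry(const RemoteCursorData& remote, bool errorIsRetriable) {
    return !remote.cursorId && errorIsRetriable &&
        remote.retryCount < kMaxNumFailedHostRetryAttempts;
}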