author    | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:01:13 -0500
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:09:28 -0500
commit    | e49a2a16fb4b337d7ed1b0ec8d766f281741d8bf (patch)
tree      | 7f0cac8cb50e56819bfe2546a525076976b1e193 /src/mongo/s
parent    | 715e9e1cdc618dad480a7a1a73458daf6ea9ce0f (diff)
SERVER-22027 Sharding should not retry killed operations
This change introduces a separate interruption error code, InterruptedDueToReplStateChange
(11602), which is used to kill operations during replication primary stepdown so that the
config server retry logic can differentiate them from user-killed operations and retry only
the former.
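
For illustration, the sketch below shows the distinction this enables. It is a minimal, self-contained C++ sketch, not code from this commit: the `sketch` namespace, `ErrorCode` enum, and `isRetriable` helper are hypothetical stand-ins, and only the two interruption code names mirror the MongoDB source.

```cpp
#include <set>

namespace sketch {

// Hypothetical stand-ins for mongo::ErrorCodes values; only the two
// interruption code names mirror the real codebase.
enum class ErrorCode {
    Interrupted,                      // operation killed explicitly by a user
    InterruptedDueToReplStateChange,  // operation killed by primary stepdown (code 11602)
    NotMaster,
    NetworkTimeout,
};

// A stepdown interruption is transient (a new primary will be elected), so a
// config server write is safe to retry; a deliberate user kill is not.
bool isRetriable(ErrorCode code) {
    static const std::set<ErrorCode> kRetriable{
        ErrorCode::InterruptedDueToReplStateChange,
        ErrorCode::NotMaster,
        ErrorCode::NetworkTimeout,
    };
    return kRetriable.count(code) > 0;
}

}  // namespace sketch
```

A user-initiated kill still surfaces as Interrupted, which remains non-retriable, while a stepdown kill surfaces as InterruptedDueToReplStateChange and can be safely retried once a new primary steps up.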
Diffstat (limited to 'src/mongo/s')
4 files changed, 13 insertions(+), 23 deletions(-)
```diff
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5943cc6e255..5686476e1a3 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,8 +389,6 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
-    auto configShard = grid.shardRegistry()->getShard(txn, "config");
-
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index b21ff63961f..9bf44eaa09c 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::Interrupted, "Interruption");
+        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
     });
 
     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
         auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
         writeErrDetail->setIndex(0);
-        writeErrDetail->setErrCode(ErrorCodes::Interrupted);
+        writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
         writeErrDetail->setErrMessage("Operation interrupted");
         response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index 55121f42209..e5b91dda83a 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,6 +126,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
+    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
     // to be retried if idempotency can be guaranteed.
@@ -133,10 +134,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    // This set includes interrupted because replica set step down kills all server operations
-    // before it closes connections so it may happen that the caller actually receives the
-    // interruption.
-    ErrorCodes::Interrupted};
+    ErrorCodes::InterruptedDueToReplStateChange};
 
 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -782,7 +780,8 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
+        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index 8b23528a04a..c82de6a3bbe 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,15 +51,6 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;
 
-/**
- * Returns whether a particular error code returned from the initial cursor establishment should
- * be retried.
- */
-bool isPerShardRetriableError(ErrorCodes::Error err) {
-    return (ShardRegistry::kAllRetriableErrors.count(err) ||
-            err == ErrorCodes::NotMasterOrSecondary);
-}
-
 }  // namespace
 
 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -438,8 +429,7 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            // TODO: Pass down an OperationContext* to use here.
-            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
+            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -453,7 +443,10 @@ void AsyncResultsMerger::handleBatchResponse(
     // If the error is retriable, schedule another request.
     if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-        isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
+        ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
+        LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
+               << causedBy(cursorResponseStatus.getStatus());
+
         ++remote.retryCount;
 
         // Since we potentially updated the targeter that the last host it chose might be
@@ -641,13 +634,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);
 
-    // TODO: Pass down an OperationContext* to use here.
-    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
+    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
     if (!shard) {
         return Status(ErrorCodes::ShardNotFound,
                       str::stream() << "Could not find shard " << *shardId);
     }
 
+    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {
```
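
To make the new classification concrete, here is a simplified, runnable sketch of the behavior introduced in the `shard_registry.cpp` hunk above. The `Targeter` struct and `ErrorCode` enum are illustrative stand-ins for the real `mongo::RemoteCommandTargeter` and `mongo::ErrorCodes`, not the actual interfaces.

```cpp
#include <iostream>
#include <string>

namespace sketch {

enum class ErrorCode { OK, NotMaster, InterruptedDueToReplStateChange, HostUnreachable };

// Stand-in for mongo::RemoteCommandTargeter.
struct Targeter {
    void markHostNotMaster(const std::string& host) {
        std::cout << host << ": marked not master; monitor will re-scan for the new primary\n";
    }
    void markHostUnreachable(const std::string& host) {
        std::cout << host << ": marked unreachable\n";
    }
};

bool isNotMasterError(ErrorCode c) { return c == ErrorCode::NotMaster; }
bool isNetworkError(ErrorCode c) { return c == ErrorCode::HostUnreachable; }

// Mirrors the updated updateReplSetMonitor(): a stepdown interruption implies
// the remote node is no longer primary, so it is treated like a notMaster
// response rather than like a generic failure.
void updateReplSetMonitor(Targeter& targeter, const std::string& host, ErrorCode status) {
    if (isNotMasterError(status) || status == ErrorCode::InterruptedDueToReplStateChange) {
        targeter.markHostNotMaster(host);
    } else if (isNetworkError(status)) {
        targeter.markHostUnreachable(host);
    }
}

}  // namespace sketch

int main() {
    sketch::Targeter targeter;
    // A stepdown kill and a plain notMaster response update the monitor identically.
    sketch::updateReplSetMonitor(targeter, "node1:27017", sketch::ErrorCode::NotMaster);
    sketch::updateReplSetMonitor(targeter, "node1:27017",
                                 sketch::ErrorCode::InterruptedDueToReplStateChange);
    return 0;
}
```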