author    | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:01:13 -0500
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:09:28 -0500
commit    | e49a2a16fb4b337d7ed1b0ec8d766f281741d8bf (patch)
tree      | 7f0cac8cb50e56819bfe2546a525076976b1e193 /src/mongo/s
parent    | 715e9e1cdc618dad480a7a1a73458daf6ea9ce0f (diff)
SERVER-22027 Sharding should not retry killed operations
This change introduces a separate interruption error code, InterruptedDueToReplStateChange
(11602), which is used to kill operations during replication primary stepdown so that the
config server retry logic can differentiate them from user-killed operations and retry only
the former.
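
For illustration, the sketch below shows the distinction this enables. It is a minimal, self-contained C++ sketch, not code from this commit: the `sketch` namespace, `ErrorCode` enum, and `isRetriable` helper are hypothetical stand-ins, and only the two interruption code names mirror the MongoDB source.

```cpp
#include <set>

namespace sketch {

// Hypothetical stand-ins for mongo::ErrorCodes values; only the two
// interruption code names mirror the real codebase.
enum class ErrorCode {
    Interrupted,                      // operation killed explicitly by a user
    InterruptedDueToReplStateChange,  // operation killed by primary stepdown (code 11602)
    NotMaster,
    NetworkTimeout,
};

// A stepdown interruption is transient (a new primary will be elected), so a
// config server write is safe to retry; a deliberate user kill is not.
bool isRetriable(ErrorCode code) {
    static const std::set<ErrorCode> kRetriable{
        ErrorCode::InterruptedDueToReplStateChange,
        ErrorCode::NotMaster,
        ErrorCode::NetworkTimeout,
    };
    return kRetriable.count(code) > 0;
}

}  // namespace sketch
```

A user-initiated kill still surfaces as Interrupted, which remains non-retriable, while a stepdown kill surfaces as InterruptedDueToReplStateChange and can be safely retried once a new primary steps up.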
Diffstat (limited to 'src/mongo/s')
4 files changed, 13 insertions(+), 23 deletions(-)
```diff
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5943cc6e255..5686476e1a3 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,8 +389,6 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
-    auto configShard = grid.shardRegistry()->getShard(txn, "config");
-
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index b21ff63961f..9bf44eaa09c 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::Interrupted, "Interruption");
+        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
     });
 
     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
         auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
         writeErrDetail->setIndex(0);
-        writeErrDetail->setErrCode(ErrorCodes::Interrupted);
+        writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
         writeErrDetail->setErrMessage("Operation interrupted");
         response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index 55121f42209..e5b91dda83a 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,6 +126,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
+    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
     // to be retried if idempotency can be guaranteed.
@@ -133,10 +134,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    // This set includes interrupted because replica set step down kills all server operations
-    // before it closes connections so it may happen that the caller actually receives the
-    // interruption.
-    ErrorCodes::Interrupted};
+    ErrorCodes::InterruptedDueToReplStateChange};
 
 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -782,7 +780,8 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
+        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index 8b23528a04a..c82de6a3bbe 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,15 +51,6 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;
 
-/**
- * Returns whether a particular error code returned from the initial cursor establishment should
- * be retried.
- */
-bool isPerShardRetriableError(ErrorCodes::Error err) {
-    return (ShardRegistry::kAllRetriableErrors.count(err) ||
-            err == ErrorCodes::NotMasterOrSecondary);
-}
-
 }  // namespace
 
 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -438,8 +429,7 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            // TODO: Pass down an OperationContext* to use here.
-            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
+            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -453,7 +443,10 @@ void AsyncResultsMerger::handleBatchResponse(
     // If the error is retriable, schedule another request.
     if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-        isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
+        ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
+        LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
+               << causedBy(cursorResponseStatus.getStatus());
+
         ++remote.retryCount;
 
         // Since we potentially updated the targeter that the last host it chose might be
@@ -641,13 +634,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);
 
-    // TODO: Pass down an OperationContext* to use here.
-    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
+    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
     if (!shard) {
         return Status(ErrorCodes::ShardNotFound,
                       str::stream() << "Could not find shard " << *shardId);
     }
 
+    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {
```
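
To make the new classification concrete, here is a simplified, runnable sketch of the behavior introduced in the `shard_registry.cpp` hunk above. The `Targeter` struct and `ErrorCode` enum are illustrative stand-ins for the real `mongo::RemoteCommandTargeter` and `mongo::ErrorCodes`, not the actual interfaces.

```cpp
#include <iostream>
#include <string>

namespace sketch {

enum class ErrorCode { OK, NotMaster, InterruptedDueToReplStateChange, HostUnreachable };

// Stand-in for mongo::RemoteCommandTargeter.
struct Targeter {
    void markHostNotMaster(const std::string& host) {
        std::cout << host << ": marked not master; monitor will re-scan for the new primary\n";
    }
    void markHostUnreachable(const std::string& host) {
        std::cout << host << ": marked unreachable\n";
    }
};

bool isNotMasterError(ErrorCode c) { return c == ErrorCode::NotMaster; }
bool isNetworkError(ErrorCode c) { return c == ErrorCode::HostUnreachable; }

// Mirrors the updated updateReplSetMonitor(): a stepdown interruption implies
// the remote node is no longer primary, so it is treated like a notMaster
// response rather than like a generic failure.
void updateReplSetMonitor(Targeter& targeter, const std::string& host, ErrorCode status) {
    if (isNotMasterError(status) || status == ErrorCode::InterruptedDueToReplStateChange) {
        targeter.markHostNotMaster(host);
    } else if (isNetworkError(status)) {
        targeter.markHostUnreachable(host);
    }
}

}  // namespace sketch

int main() {
    sketch::Targeter targeter;
    // A stepdown kill and a plain notMaster response update the monitor identically.
    sketch::updateReplSetMonitor(targeter, "node1:27017", sketch::ErrorCode::NotMaster);
    sketch::updateReplSetMonitor(targeter, "node1:27017",
                                 sketch::ErrorCode::InterruptedDueToReplStateChange);
    return 0;
}
```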