author     Kaloian Manassiev <kaloian.manassiev@mongodb.com>   2015-12-30 17:01:04 -0500
committer  Kaloian Manassiev <kaloian.manassiev@mongodb.com>   2015-12-30 17:09:27 -0500
commit     715e9e1cdc618dad480a7a1a73458daf6ea9ce0f (patch)
tree       95ee80f3e51d3218647bc6fb013dec7f3f735297 /src
parent     5d2d6e209acd862324612c7f9c41d65940f8dcba (diff)
download   mongo-715e9e1cdc618dad480a7a1a73458daf6ea9ce0f.tar.gz
Revert "SERVER-22027 Sharding should not retry killed operations"
This reverts commit 5d2d6e209acd862324612c7f9c41d65940f8dcba.
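The heart of the revert is the interruption API on OperationContext: the kill-code model introduced by SERVER-22027 (markKilled takes an ErrorCodes::Error, getKillStatus reports it) goes back to a plain boolean kill flag. The standalone C++ sketch below contrasts the two models; it is an illustration only, and the class and enum names (FlagKillableOp, CodeKillableOp, ErrorCode) are invented, not MongoDB's actual types.

#include <atomic>
#include <cassert>

enum class ErrorCode { OK = 0, Interrupted, InterruptedDueToReplStateChange };

// Model restored by this revert: a plain kill flag; every interrupted
// operation surfaces the same generic Interrupted error.
class FlagKillableOp {
public:
    void markKilled() {
        _killPending.store(1);
    }
    bool isKillPending() const {
        return _killPending.load(std::memory_order_relaxed);
    }

private:
    std::atomic<int> _killPending{0};
};

// Model being reverted (SERVER-22027): the killer supplies a specific error
// code, and a compare-and-swap ensures only the first kill code is preserved.
class CodeKillableOp {
public:
    void markKilled(ErrorCode killCode = ErrorCode::Interrupted) {
        assert(killCode != ErrorCode::OK);
        ErrorCode expected = ErrorCode::OK;
        _killCode.compare_exchange_strong(expected, killCode);
    }
    ErrorCode getKillStatus() const {
        return _killCode.load(std::memory_order_relaxed);
    }
    bool isKillPending() const {
        return getKillStatus() != ErrorCode::OK;
    }

private:
    std::atomic<ErrorCode> _killCode{ErrorCode::OK};
};

The kill-code variant lets the interruption site report why the operation died (for example, a replication state change) instead of a generic interruption, which is what the diff below takes away again.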
Diffstat (limited to 'src')
 src/mongo/base/error_codes.err                                                |  6
 src/mongo/db/operation_context.cpp                                            |  9
 src/mongo/db/operation_context.h                                              | 40
 src/mongo/db/operation_context_impl.cpp                                       |  6
 src/mongo/db/repl/replication_coordinator_external_state_impl.cpp             |  4
 src/mongo/db/service_context.h                                                |  2
 src/mongo/db/service_context_d.cpp                                            | 38
 src/mongo/db/service_context_d.h                                              | 15
 src/mongo/db/service_context_noop.cpp                                         |  3
 src/mongo/db/service_context_noop.h                                           |  2
 src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp                   |  2
 src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp  |  4
 src/mongo/s/client/shard_registry.cpp                                         |  9
 src/mongo/s/query/async_results_merger.cpp                                    | 21

 14 files changed, 88 insertions, 73 deletions
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 43fdefa1c50..faed0dbb14a 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -160,7 +160,6 @@ error_code("CannotGrowDocumentInCappedNamespace", 10003)
 error_code("DuplicateKey", 11000)
 error_code("InterruptedAtShutdown", 11600)
 error_code("Interrupted", 11601)
-error_code("InterruptedDueToReplStateChange", 11602)
 error_code("OutOfDiskSpace", 14031 )
 error_code("KeyTooLong", 17280);
 error_code("BackgroundOperationInProgressForDatabase", 12586);
@@ -174,10 +173,7 @@ error_code("PrepareConfigsFailed", 13104);
 
 # SERVER-21428 explains the extra changes needed if error_class components change.
 error_class("NetworkError", ["HostUnreachable", "HostNotFound", "NetworkTimeout"])
-error_class("Interruption", ["Interrupted",
-                             "InterruptedAtShutdown",
-                             "InterruptedDueToReplStateChange",
-                             "ExceededTimeLimit"])
+error_class("Interruption", ["Interrupted", "InterruptedAtShutdown", "ExceededTimeLimit"])
 error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"])
 error_class("StaleShardingError", ["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"])
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 1434ee5ddc7..1b79dadcb50 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -46,13 +46,12 @@ Client* OperationContext::getClient() const {
     return _client;
 }
 
-void OperationContext::markKilled(ErrorCodes::Error killCode) {
-    invariant(killCode != ErrorCodes::OK);
-    _killCode.compareAndSwap(ErrorCodes::OK, killCode);
+void OperationContext::markKilled() {
+    _killPending.store(1);
 }
 
-ErrorCodes::Error OperationContext::getKillStatus() const {
-    return _killCode.loadRelaxed();
+bool OperationContext::isKillPending() const {
+    return _killPending.loadRelaxed();
 }
 
 }  // namespace mongo
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 3a7e3165dda..a146a8f7f14 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -77,6 +77,7 @@ public:
      */
     virtual RecoveryUnit* recoveryUnit() const = 0;
 
+    /**
      * Returns the RecoveryUnit (same return value as recoveryUnit()) but the caller takes
      * ownership of the returned RecoveryUnit, and the OperationContext instance relinquishes
@@ -182,33 +183,23 @@ public:
     virtual bool writesAreReplicated() const = 0;
 
     /**
-     * Marks this operation as killed so that subsequent calls to checkForInterrupt and
-     * checkForInterruptNoAssert by the thread executing the operation will start returning the
-     * specified error code.
+     * Marks this operation as killed.
      *
-     * If multiple threads kill the same operation with different codes, only the first code will
-     * be preserved.
+     * Subsequent calls to checkForInterrupt and checkForInterruptNoAssert by the thread
+     * executing the operation will indicate that the operation has been killed.
      *
-     * May be called by any thread that has locked the Client owning this operation context.
+     * May be called by any thread that has locked the Client owning this operation context,
+     * or by the thread executing on behalf of this operation context.
      */
-    void markKilled(ErrorCodes::Error killCode = ErrorCodes::Interrupted);
+    void markKilled();
 
     /**
-     * Returns the code passed to markKilled if this operation context has been killed previously
-     * or ErrorCodes::OK otherwise.
+     * Returns true if markKilled has been called on this operation context.
      *
-     * May be called by any thread that has locked the Client owning this operation context, or
-     * without lock by the thread executing on behalf of this operation context.
-     */
-    ErrorCodes::Error getKillStatus() const;
-
-    /**
-     * Shortcut method, which checks whether getKillStatus returns a non-OK value. Has the same
-     * concurrency rules as getKillStatus.
+     * May be called by any thread that has locked the Client owning this operation context,
+     * or by the thread executing on behalf of this operation context.
      */
-    bool isKillPending() const {
-        return getKillStatus() != ErrorCodes::OK;
-    }
+    bool isKillPending() const;
 
 protected:
     OperationContext(Client* client, unsigned int opId, Locker* locker);
@@ -220,14 +211,11 @@ private:
     Client* const _client;
     const unsigned int _opId;
 
-    // Not owned.
+    // The lifetime of locker is managed by subclasses of OperationContext, so it is not
+    // safe to access _locker in the destructor of OperationContext.
     Locker* const _locker;
 
-    // Follows the values of ErrorCodes::Error. The default value is 0 (OK), which means the
-    // operation is not killed. If killed, it will contain a specific code. This value changes only
-    // once from OK to some kill code.
-    AtomicWord<ErrorCodes::Error> _killCode{ErrorCodes::OK};
-
+    AtomicInt32 _killPending{0};
     WriteConcernOptions _writeConcern;
 };
diff --git a/src/mongo/db/operation_context_impl.cpp b/src/mongo/db/operation_context_impl.cpp
index b7733958bc2..f2c0166876f 100644
--- a/src/mongo/db/operation_context_impl.cpp
+++ b/src/mongo/db/operation_context_impl.cpp
@@ -140,7 +140,6 @@ uint64_t OperationContextImpl::getRemainingMaxTimeMicros() const {
 MONGO_FP_DECLARE(checkForInterruptFail);
 
 namespace {
-
 // Helper function for checkForInterrupt fail point. Decides whether the operation currently
 // being run by the given Client meet the (probabilistic) conditions for interruption as
 // specified in the fail point info.
@@ -195,9 +194,8 @@ Status OperationContextImpl::checkForInterruptNoAssert() {
         }
     }
 
-    const auto killStatus = getKillStatus();
-    if (killStatus != ErrorCodes::OK) {
-        return Status(killStatus, "operation was interrupted");
+    if (isKillPending()) {
+        return Status(ErrorCodes::Interrupted, "operation was interrupted");
     }
 
     return Status::OK();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 52b0fee436c..c42b80a605d 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -350,8 +350,8 @@ void ReplicationCoordinatorExternalStateImpl::closeConnections() {
 }
 
 void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) {
-    ServiceContext* environment = txn->getServiceContext();
-    environment->killAllUserOperations(txn, ErrorCodes::InterruptedDueToReplStateChange);
+    ServiceContext* environment = getGlobalServiceContext();
+    environment->killAllUserOperations(txn);
 }
 
 void ReplicationCoordinatorExternalStateImpl::clearShardingState() {
diff --git a/src/mongo/db/service_context.h b/src/mongo/db/service_context.h
index 4e63046726f..6f9aa36a22f 100644
--- a/src/mongo/db/service_context.h
+++ b/src/mongo/db/service_context.h
@@ -284,7 +284,7 @@ public:
      * Kills all operations that have a Client that is associated with an incoming user
      * connection, except for the one associated with txn.
      */
-    virtual void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) = 0;
+    virtual void killAllUserOperations(const OperationContext* txn) = 0;
 
     /**
      * Registers a listener to be notified each time an op is killed.
diff --git a/src/mongo/db/service_context_d.cpp b/src/mongo/db/service_context_d.cpp
index 470dcb927a5..3e5fd2d5be7 100644
--- a/src/mongo/db/service_context_d.cpp
+++ b/src/mongo/db/service_context_d.cpp
@@ -220,9 +220,21 @@ bool ServiceContextMongoD::getKillAllOperations() {
     return _globalKill;
 }
 
-void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
-                                                 ErrorCodes::Error killCode) {
-    opCtx->markKilled(killCode);
+bool ServiceContextMongoD::_killOperationsAssociatedWithClientAndOpId_inlock(Client* client,
+                                                                             unsigned int opId) {
+    OperationContext* opCtx = client->getOperationContext();
+    if (!opCtx) {
+        return false;
+    }
+    if (opCtx->getOpID() != opId) {
+        return false;
+    }
+    _killOperation_inlock(opCtx);
+    return true;
+}
+
+void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx) {
+    opCtx->markKilled();
 
     for (const auto listener : _killOpListeners) {
         try {
@@ -236,10 +248,8 @@ void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
 bool ServiceContextMongoD::killOperation(unsigned int opId) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         stdx::lock_guard<Client> lk(*client);
-
-        OperationContext* opCtx = client->getOperationContext();
-        if (opCtx && opCtx->getOpID() == opId) {
-            _killOperation_inlock(opCtx, ErrorCodes::Interrupted);
+        bool found = _killOperationsAssociatedWithClientAndOpId_inlock(client, opId);
+        if (found) {
             return true;
         }
     }
@@ -247,8 +257,7 @@ bool ServiceContextMongoD::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
-                                                 ErrorCodes::Error killCode) {
+void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         if (!client->isFromUserConnection()) {
             // Don't kill system operations.
@@ -257,11 +266,16 @@ void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
 
         stdx::lock_guard<Client> lk(*client);
         OperationContext* toKill = client->getOperationContext();
+        if (!toKill) {
+            continue;
+        }
 
-        // Don't kill ourself.
-        if (toKill && toKill->getOpID() != txn->getOpID()) {
-            _killOperation_inlock(toKill, killCode);
+        if (toKill->getOpID() == txn->getOpID()) {
+            // Don't kill ourself.
+            continue;
         }
+
+        _killOperation_inlock(toKill);
     }
 }
diff --git a/src/mongo/db/service_context_d.h b/src/mongo/db/service_context_d.h
index dc834197e9e..0c560ff17f4 100644
--- a/src/mongo/db/service_context_d.h
+++ b/src/mongo/db/service_context_d.h
@@ -67,7 +67,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+    void killAllUserOperations(const OperationContext* txn) override;
 
     void registerKillOpListener(KillOpListenerInterface* listener) override;
 
@@ -79,11 +79,22 @@ private:
     std::unique_ptr<OperationContext> _newOpCtx(Client* client) override;
 
    /**
+     * Kills the active operation on "client" if that operation is associated with operation id
+     * "opId".
+     *
+     * Returns true if an operation was killed.
+     *
+     * Must only be called by a thread owning both this service context's mutex and the
+     * client's.
+     */
+    bool _killOperationsAssociatedWithClientAndOpId_inlock(Client* client, unsigned int opId);
+
+    /**
     * Kills the given operation.
     *
     * Caller must own the service context's _mutex.
     */
-    void _killOperation_inlock(OperationContext* opCtx, ErrorCodes::Error killCode);
+    void _killOperation_inlock(OperationContext* opCtx);
 
     bool _globalKill;
diff --git a/src/mongo/db/service_context_noop.cpp b/src/mongo/db/service_context_noop.cpp
index fd94b35db89..65184906442 100644
--- a/src/mongo/db/service_context_noop.cpp
+++ b/src/mongo/db/service_context_noop.cpp
@@ -79,8 +79,7 @@ bool ServiceContextNoop::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextNoop::killAllUserOperations(const OperationContext* txn,
-                                               ErrorCodes::Error killCode) {}
+void ServiceContextNoop::killAllUserOperations(const OperationContext* txn) {}
 
 void ServiceContextNoop::registerKillOpListener(KillOpListenerInterface* listener) {}
diff --git a/src/mongo/db/service_context_noop.h b/src/mongo/db/service_context_noop.h
index 62e6a169896..0e74b61b7a2 100644
--- a/src/mongo/db/service_context_noop.h
+++ b/src/mongo/db/service_context_noop.h
@@ -49,7 +49,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+    void killAllUserOperations(const OperationContext* txn) override;
 
     void setKillAllOperations() override;
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5686476e1a3..5943cc6e255 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,6 +389,8 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
+    auto configShard = grid.shardRegistry()->getShard(txn, "config");
+
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index 9bf44eaa09c..b21ff63961f 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
+        return Status(ErrorCodes::Interrupted, "Interruption");
     });
 
     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
         auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
         writeErrDetail->setIndex(0);
-        writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
+        writeErrDetail->setErrCode(ErrorCodes::Interrupted);
         writeErrDetail->setErrMessage("Operation interrupted");
         response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index e5b91dda83a..55121f42209 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,7 +126,6 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
-    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
     // to be retried if idempotency can be guaranteed.
@@ -134,7 +133,10 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    ErrorCodes::InterruptedDueToReplStateChange};
+    // This set includes interrupted because replica set step down kills all server operations
+    // before it closes connections so it may happen that the caller actually receives the
+    // interruption.
+    ErrorCodes::Interrupted};
 
 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -780,8 +782,7 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
-        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index c82de6a3bbe..8b23528a04a 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,6 +51,15 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;
 
+/**
+ * Returns whether a particular error code returned from the initial cursor establishment should
+ * be retried.
+ */
+bool isPerShardRetriableError(ErrorCodes::Error err) {
+    return (ShardRegistry::kAllRetriableErrors.count(err) ||
+            err == ErrorCodes::NotMasterOrSecondary);
+}
+
 }  // namespace
 
 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -429,7 +438,8 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
+            // TODO: Pass down an OperationContext* to use here.
+            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -443,10 +453,7 @@ void AsyncResultsMerger::handleBatchResponse(
 
     // If the error is retriable, schedule another request.
     if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-        ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
-        LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
-               << causedBy(cursorResponseStatus.getStatus());
-
+        isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
         ++remote.retryCount;
 
         // Since we potentially updated the targeter that the last host it chose might be
@@ -634,13 +641,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);
 
-    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
+    // TODO: Pass down an OperationContext* to use here.
+    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
     if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << *shardId);
    }
 
-    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {
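One practical consequence, called out in the restored comment in shard_registry.cpp: a replica set stepdown kills running operations before it closes connections, so a remote caller may see a generic Interrupted error for a request that is safe to retry when the operation is idempotent. Below is a rough, self-contained sketch of that retry policy; Code, Status, isRetriable, and runWithRetries are invented stand-ins for illustration, not MongoDB's real types.

#include <functional>
#include <set>
#include <string>

// Invented stand-ins for MongoDB's ErrorCodes and Status.
enum class Code {
    OK, NotMaster, NotMasterNoSlaveOk, WriteConcernFailed,
    HostUnreachable, HostNotFound, NetworkTimeout, Interrupted
};

struct Status {
    Code code{Code::OK};
    std::string reason;
    bool isOK() const { return code == Code::OK; }
};

// Mirrors the spirit of the restored kAllRetriableErrors set: Interrupted is
// treated as retriable because stepdown kills operations before it closes
// connections.
bool isRetriable(Code code) {
    static const std::set<Code> kRetriable{
        Code::NotMaster,       Code::NotMasterNoSlaveOk, Code::WriteConcernFailed,
        Code::HostUnreachable, Code::HostNotFound,       Code::NetworkTimeout,
        Code::Interrupted};
    return kRetriable.count(code) > 0;
}

// Retries an idempotent remote operation a bounded number of times on
// retriable errors, the way the sharding write paths do.
Status runWithRetries(const std::function<Status()>& op, int maxRetries = 3) {
    Status last;
    for (int attempt = 0; attempt <= maxRetries; ++attempt) {
        last = op();
        if (last.isOK() || !isRetriable(last.code)) {
            break;
        }
    }
    return last;
}

The retry bound corresponds to kMaxNumFailedHostRetryAttempts in async_results_merger.cpp; retrying only idempotent operations is what makes treating the generic Interrupted code as retriable safe here.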