diff options
author | Esha Maharishi <esha.maharishi@mongodb.com> | 2018-08-27 14:06:28 -0400 |
---|---|---|
committer | Esha Maharishi <esha.maharishi@mongodb.com> | 2018-08-27 16:21:38 -0400 |
commit | 2aede7ad2fce2616e4140f2ae398e4e570c84703 (patch) | |
tree | dd8e6f6722608f88504b3cd992e7f1c9bef93b42 | |
parent | 0fe87e465305a82419d49b71bb9618701222065b (diff) | |
download | mongo-2aede7ad2fce2616e4140f2ae398e4e570c84703.tar.gz |
SERVER-36755 CollectionLock acquisition in shard_filtering_metadata_refresh.cpp can cause server to terminate on stepdown (1/2)
5 files changed, 29 insertions, 21 deletions
diff --git a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp index 80e1efd2c3d..55c5bfecbcf 100644 --- a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp +++ b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp @@ -72,8 +72,8 @@ ScopedOperationCompletionShardingActions::~ScopedOperationCompletionShardingActi } if (auto staleInfo = status->extraInfo<StaleConfigInfo>()) { - auto handleMismatchStatus = - onShardVersionMismatch(_opCtx, staleInfo->getNss(), staleInfo->getVersionReceived()); + auto handleMismatchStatus = onShardVersionMismatchNoExcept( + _opCtx, staleInfo->getNss(), staleInfo->getVersionReceived()); if (!handleMismatchStatus.isOK()) log() << "Failed to handle stale version exception" << causedBy(redact(handleMismatchStatus)); diff --git a/src/mongo/db/s/set_shard_version_command.cpp b/src/mongo/db/s/set_shard_version_command.cpp index 0876fc56eb5..e14feab3f7a 100644 --- a/src/mongo/db/s/set_shard_version_command.cpp +++ b/src/mongo/db/s/set_shard_version_command.cpp @@ -342,7 +342,7 @@ public: // Note: The forceRefresh flag controls whether we make sure to do our // own refresh or if we're okay with joining another thread - const auto status = onShardVersionMismatch( + const auto status = onShardVersionMismatchNoExcept( opCtx, nss, requestedVersion, forceRefresh /*forceRefreshFromThisThread*/); { diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp index 8bb5b5a2614..0f284f1f6dd 100644 --- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp +++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp @@ -47,10 +47,12 @@ namespace mongo { -Status onShardVersionMismatch(OperationContext* opCtx, - const NamespaceString& nss, - ChunkVersion shardVersionReceived, - bool forceRefreshFromThisThread) noexcept { +namespace { + +void onShardVersionMismatch(OperationContext* opCtx, + const NamespaceString& nss, + ChunkVersion shardVersionReceived, + bool forceRefreshFromThisThread) { invariant(!opCtx->lockState()->isLocked()); invariant(!opCtx->getClient()->isInDirectClient()); @@ -65,12 +67,8 @@ Status onShardVersionMismatch(OperationContext* opCtx, // Ensure any ongoing migrations have completed before trying to do the refresh. This wait is // just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry // attempts while the migration is being committed. - try { - auto& oss = OperationShardingState::get(opCtx); - oss.waitForMigrationCriticalSectionSignal(opCtx); - } catch (const DBException& ex) { - return ex.toStatus(); - } + auto& oss = OperationShardingState::get(opCtx); + oss.waitForMigrationCriticalSectionSignal(opCtx); const auto currentShardVersion = [&] { AutoGetCollection autoColl(opCtx, nss, MODE_IS); @@ -86,11 +84,19 @@ Status onShardVersionMismatch(OperationContext* opCtx, currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) { // Don't need to remotely reload if we're in the same epoch and the requested version is // smaller than the one we know about. This means that the remote side is behind. - return Status::OK(); } + forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread); +} + +} // namespace + +Status onShardVersionMismatchNoExcept(OperationContext* opCtx, + const NamespaceString& nss, + ChunkVersion shardVersionReceived, + bool forceRefreshFromThisThread) noexcept { try { - forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread); + onShardVersionMismatch(opCtx, nss, shardVersionReceived, forceRefreshFromThisThread); return Status::OK(); } catch (const DBException& ex) { log() << "Failed to refresh metadata for collection" << nss << causedBy(redact(ex)); diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.h b/src/mongo/db/s/shard_filtering_metadata_refresh.h index c4ae17afd02..a0397726090 100644 --- a/src/mongo/db/s/shard_filtering_metadata_refresh.h +++ b/src/mongo/db/s/shard_filtering_metadata_refresh.h @@ -53,10 +53,10 @@ class OperationContext; * execution state in the response. This is specifically problematic for write commands, which are * expected to return the set of write batch entries that succeeded. */ -Status onShardVersionMismatch(OperationContext* opCtx, - const NamespaceString& nss, - ChunkVersion shardVersionReceived, - bool forceRefreshFromThisThread = false) noexcept; +Status onShardVersionMismatchNoExcept(OperationContext* opCtx, + const NamespaceString& nss, + ChunkVersion shardVersionReceived, + bool forceRefreshFromThisThread = false) noexcept; /** * Unconditionally causes the shard's filtering metadata to be refreshed from the config server and diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp index c068978aeea..23f55f61554 100644 --- a/src/mongo/db/service_entry_point_common.cpp +++ b/src/mongo/db/service_entry_point_common.cpp @@ -888,7 +888,8 @@ void execCommandDatabase(OperationContext* opCtx, if (!opCtx->getClient()->isInDirectClient()) { // We already have the StaleConfig exception, so just swallow any errors due to // refresh - onShardVersionMismatch(opCtx, sce->getNss(), sce->getVersionReceived()).ignore(); + onShardVersionMismatchNoExcept(opCtx, sce->getNss(), sce->getVersionReceived()) + .ignore(); } } else if (auto sce = e.extraInfo<StaleDbRoutingVersion>()) { if (!opCtx->getClient()->isInDirectClient()) { @@ -1072,7 +1073,8 @@ DbResponse receivedQuery(OperationContext* opCtx, if (!opCtx->getClient()->isInDirectClient()) { // We already have the StaleConfig exception, so just swallow any errors due to // refresh - onShardVersionMismatch(opCtx, sce->getNss(), sce->getVersionReceived()).ignore(); + onShardVersionMismatchNoExcept(opCtx, sce->getNss(), sce->getVersionReceived()) + .ignore(); } } |