summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEsha Maharishi <esha.maharishi@mongodb.com>2018-08-27 14:06:28 -0400
committerEsha Maharishi <esha.maharishi@mongodb.com>2018-08-27 16:21:38 -0400
commit2aede7ad2fce2616e4140f2ae398e4e570c84703 (patch)
treedd8e6f6722608f88504b3cd992e7f1c9bef93b42
parent0fe87e465305a82419d49b71bb9618701222065b (diff)
downloadmongo-2aede7ad2fce2616e4140f2ae398e4e570c84703.tar.gz
SERVER-36755 CollectionLock acquisition in shard_filtering_metadata_refresh.cpp can cause server to terminate on stepdown (1/2)
-rw-r--r--src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp4
-rw-r--r--src/mongo/db/s/set_shard_version_command.cpp2
-rw-r--r--src/mongo/db/s/shard_filtering_metadata_refresh.cpp30
-rw-r--r--src/mongo/db/s/shard_filtering_metadata_refresh.h8
-rw-r--r--src/mongo/db/service_entry_point_common.cpp6
5 files changed, 29 insertions, 21 deletions
diff --git a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
index 80e1efd2c3d..55c5bfecbcf 100644
--- a/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
+++ b/src/mongo/db/s/scoped_operation_completion_sharding_actions.cpp
@@ -72,8 +72,8 @@ ScopedOperationCompletionShardingActions::~ScopedOperationCompletionShardingActi
}
if (auto staleInfo = status->extraInfo<StaleConfigInfo>()) {
- auto handleMismatchStatus =
- onShardVersionMismatch(_opCtx, staleInfo->getNss(), staleInfo->getVersionReceived());
+ auto handleMismatchStatus = onShardVersionMismatchNoExcept(
+ _opCtx, staleInfo->getNss(), staleInfo->getVersionReceived());
if (!handleMismatchStatus.isOK())
log() << "Failed to handle stale version exception"
<< causedBy(redact(handleMismatchStatus));
diff --git a/src/mongo/db/s/set_shard_version_command.cpp b/src/mongo/db/s/set_shard_version_command.cpp
index 0876fc56eb5..e14feab3f7a 100644
--- a/src/mongo/db/s/set_shard_version_command.cpp
+++ b/src/mongo/db/s/set_shard_version_command.cpp
@@ -342,7 +342,7 @@ public:
// Note: The forceRefresh flag controls whether we make sure to do our
// own refresh or if we're okay with joining another thread
- const auto status = onShardVersionMismatch(
+ const auto status = onShardVersionMismatchNoExcept(
opCtx, nss, requestedVersion, forceRefresh /*forceRefreshFromThisThread*/);
{
diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
index 8bb5b5a2614..0f284f1f6dd 100644
--- a/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
+++ b/src/mongo/db/s/shard_filtering_metadata_refresh.cpp
@@ -47,10 +47,12 @@
namespace mongo {
-Status onShardVersionMismatch(OperationContext* opCtx,
- const NamespaceString& nss,
- ChunkVersion shardVersionReceived,
- bool forceRefreshFromThisThread) noexcept {
+namespace {
+
+void onShardVersionMismatch(OperationContext* opCtx,
+ const NamespaceString& nss,
+ ChunkVersion shardVersionReceived,
+ bool forceRefreshFromThisThread) {
invariant(!opCtx->lockState()->isLocked());
invariant(!opCtx->getClient()->isInDirectClient());
@@ -65,12 +67,8 @@ Status onShardVersionMismatch(OperationContext* opCtx,
// Ensure any ongoing migrations have completed before trying to do the refresh. This wait is
// just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry
// attempts while the migration is being committed.
- try {
- auto& oss = OperationShardingState::get(opCtx);
- oss.waitForMigrationCriticalSectionSignal(opCtx);
- } catch (const DBException& ex) {
- return ex.toStatus();
- }
+ auto& oss = OperationShardingState::get(opCtx);
+ oss.waitForMigrationCriticalSectionSignal(opCtx);
const auto currentShardVersion = [&] {
AutoGetCollection autoColl(opCtx, nss, MODE_IS);
@@ -86,11 +84,19 @@ Status onShardVersionMismatch(OperationContext* opCtx,
currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) {
// Don't need to remotely reload if we're in the same epoch and the requested version is
// smaller than the one we know about. This means that the remote side is behind.
- return Status::OK();
}
+ forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
+}
+
+} // namespace
+
+Status onShardVersionMismatchNoExcept(OperationContext* opCtx,
+ const NamespaceString& nss,
+ ChunkVersion shardVersionReceived,
+ bool forceRefreshFromThisThread) noexcept {
try {
- forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
+ onShardVersionMismatch(opCtx, nss, shardVersionReceived, forceRefreshFromThisThread);
return Status::OK();
} catch (const DBException& ex) {
log() << "Failed to refresh metadata for collection" << nss << causedBy(redact(ex));
diff --git a/src/mongo/db/s/shard_filtering_metadata_refresh.h b/src/mongo/db/s/shard_filtering_metadata_refresh.h
index c4ae17afd02..a0397726090 100644
--- a/src/mongo/db/s/shard_filtering_metadata_refresh.h
+++ b/src/mongo/db/s/shard_filtering_metadata_refresh.h
@@ -53,10 +53,10 @@ class OperationContext;
* execution state in the response. This is specifically problematic for write commands, which are
* expected to return the set of write batch entries that succeeded.
*/
-Status onShardVersionMismatch(OperationContext* opCtx,
- const NamespaceString& nss,
- ChunkVersion shardVersionReceived,
- bool forceRefreshFromThisThread = false) noexcept;
+Status onShardVersionMismatchNoExcept(OperationContext* opCtx,
+ const NamespaceString& nss,
+ ChunkVersion shardVersionReceived,
+ bool forceRefreshFromThisThread = false) noexcept;
/**
* Unconditionally causes the shard's filtering metadata to be refreshed from the config server and
diff --git a/src/mongo/db/service_entry_point_common.cpp b/src/mongo/db/service_entry_point_common.cpp
index c068978aeea..23f55f61554 100644
--- a/src/mongo/db/service_entry_point_common.cpp
+++ b/src/mongo/db/service_entry_point_common.cpp
@@ -888,7 +888,8 @@ void execCommandDatabase(OperationContext* opCtx,
if (!opCtx->getClient()->isInDirectClient()) {
// We already have the StaleConfig exception, so just swallow any errors due to
// refresh
- onShardVersionMismatch(opCtx, sce->getNss(), sce->getVersionReceived()).ignore();
+ onShardVersionMismatchNoExcept(opCtx, sce->getNss(), sce->getVersionReceived())
+ .ignore();
}
} else if (auto sce = e.extraInfo<StaleDbRoutingVersion>()) {
if (!opCtx->getClient()->isInDirectClient()) {
@@ -1072,7 +1073,8 @@ DbResponse receivedQuery(OperationContext* opCtx,
if (!opCtx->getClient()->isInDirectClient()) {
// We already have the StaleConfig exception, so just swallow any errors due to
// refresh
- onShardVersionMismatch(opCtx, sce->getNss(), sce->getVersionReceived()).ignore();
+ onShardVersionMismatchNoExcept(opCtx, sce->getNss(), sce->getVersionReceived())
+ .ignore();
}
}