summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Pierlauro Sciarelli <pierlauro.sciarelli@mongodb.com> 2021-08-06 15:31:39 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-08-06 16:23:07 +0000
commit: 3bdbe14de35d849f729221805c22b9ce76a1a9d6 (patch)
tree: 4001add9ecf41fbfe4ae7c62c4668b49e74df9a0
parent: d711b1b0c5cb2bee147bba14081f0356b36f1e4a (diff)
download: mongo-3bdbe14de35d849f729221805c22b9ce76a1a9d6.tar.gz
SERVER-59152 Make range deletion submission resilient to onShardVersionMismatch exceptions
-rw-r--r-- src/mongo/db/s/migration_util.cpp 88
-rw-r--r-- src/mongo/db/s/migration_util.h 6
2 files changed, 48 insertions, 46 deletions
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp
index af22b7a8929..99a11b8a251 100644
--- a/src/mongo/db/s/migration_util.cpp
+++ b/src/mongo/db/s/migration_util.cpp
@@ -239,6 +239,28 @@ void retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown(
}
}
+void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const NamespaceString& nss) {
+ retryIdempotentWorkAsPrimaryUntilSuccessOrStepdown(
+ opCtx, "refreshFilteringMetadataUntilSuccess", [&nss](OperationContext* newOpCtx) {
+ hangInRefreshFilteringMetadataUntilSuccessInterruptible.pauseWhileSet(newOpCtx);
+
+ try {
+ onShardVersionMismatch(newOpCtx, nss, boost::none);
+ } catch (const ExceptionFor<ErrorCodes::NamespaceNotFound>&) {
+ // Can throw NamespaceNotFound if the collection/database was dropped
+ }
+
+ if (hangInRefreshFilteringMetadataUntilSuccessThenSimulateErrorUninterruptible
+ .shouldFail()) {
+ hangInRefreshFilteringMetadataUntilSuccessThenSimulateErrorUninterruptible
+ .pauseWhileSet();
+ uasserted(ErrorCodes::InternalError,
+ "simulate an error response for onShardVersionMismatch");
+ }
+ });
+}
+
+
} // namespace
std::shared_ptr<executor::ThreadPoolTaskExecutor> getMigrationUtilExecutor(
@@ -393,48 +415,34 @@ ExecutorFuture<void> submitRangeDeletionTask(OperationContext* opCtx,
<< " because the disableResumableRangeDeleter server parameter is set to true",
!disableResumableRangeDeleter.load());
- // Make sure the collection metadata is up-to-date.
- while (true) {
- {
- AutoGetCollection autoColl(opCtx, deletionTask.getNss(), MODE_IS);
- auto csr = CollectionShardingRuntime::get(opCtx, deletionTask.getNss());
- auto optCollDescr = csr->getCurrentMetadataIfKnown();
-
- if (deletionTaskUuidMatchesFilteringMetadataUuid(
- opCtx, optCollDescr, deletionTask)) {
- break;
- }
-
- // If the collection's filtering metadata is not known, is unsharded, or its
- // UUID does not match the UUID of the deletion task, force a filtering metadata
- // refresh, because this node may have just stepped up and therefore may have a
- // stale cache.
- LOGV2(22024,
- "Filtering metadata for this range deletion task may be outdated; "
- "forcing refresh",
- "deletionTask"_attr = redact(deletionTask.toBSON()),
- "error"_attr =
- (optCollDescr ? (optCollDescr->isSharded()
- ? "Collection has UUID that does not match "
- "UUID of the deletion task"
- : "Collection is unsharded")
- : "Collection's sharding state is not known"),
- "namespace"_attr = deletionTask.getNss(),
- "migrationId"_attr = deletionTask.getId());
- }
-
- try {
- onShardVersionMismatch(opCtx, deletionTask.getNss(), boost::none);
- } catch (const ExceptionFor<ErrorCodes::NamespaceNotFound>&) {
- // If the database has been dropped, don't retry to get the shard version
- break;
- }
-
+ // Make sure the collection metadata is up-to-date before submitting.
+ boost::optional<CollectionMetadata> optCollDescr;
+ {
AutoGetCollection autoColl(opCtx, deletionTask.getNss(), MODE_IS);
auto csr = CollectionShardingRuntime::get(opCtx, deletionTask.getNss());
- if (csr->getCurrentMetadataIfKnown()) {
- break;
- }
+ optCollDescr = csr->getCurrentMetadataIfKnown();
+ }
+
+ if (!deletionTaskUuidMatchesFilteringMetadataUuid(opCtx, optCollDescr, deletionTask)) {
+
+ // If the collection's filtering metadata is not known, is unsharded, or its
+ // UUID does not match the UUID of the deletion task, force a filtering metadata
+ // refresh, because this node may have just stepped up and therefore may have a
+ // stale cache.
+ LOGV2(22024,
+ "Filtering metadata for this range deletion task may be outdated; "
+ "forcing refresh",
+ "deletionTask"_attr = redact(deletionTask.toBSON()),
+ "error"_attr =
+ (optCollDescr ? (optCollDescr->isSharded()
+ ? "Collection has UUID that does not match "
+ "UUID of the deletion task"
+ : "Collection is unsharded")
+ : "Collection's sharding state is not known"),
+ "namespace"_attr = deletionTask.getNss(),
+ "migrationId"_attr = deletionTask.getId());
+
+ refreshFilteringMetadataUntilSuccess(opCtx, deletionTask.getNss());
}
return AsyncTry([=]() {
diff --git a/src/mongo/db/s/migration_util.h b/src/mongo/db/s/migration_util.h
index 918371be77e..865056b36a3 100644
--- a/src/mongo/db/s/migration_util.h
+++ b/src/mongo/db/s/migration_util.h
@@ -214,12 +214,6 @@ void ensureChunkVersionIsGreaterThan(OperationContext* opCtx,
const ChunkVersion& preMigrationChunkVersion);
/**
- * Forces a filtering metadata refresh of the namespace until the refresh succeeds or the node
- * steps down or shuts down.
- */
-void refreshFilteringMetadataUntilSuccess(OperationContext* opCtx, const NamespaceString& nss);
-
-/**
* Submits an asynchronous task to scan config.migrationCoordinators and drive each unfinished
* migration coordination to completion.
*/