author     Esha Maharishi <esha.maharishi@mongodb.com>  2019-12-12 18:41:44 +0000
committer  evergreen <evergreen@mongodb.com>             2019-12-12 18:41:44 +0000
commit     af51dce90f14ae0671e2151c85bff1590df40148 (patch)
tree       7f7c72bbe5d0f141dfa876db825c53d4c6c0e6a3 /src/mongo
parent     c9b4a3d1d736036b3a6669953952308e96b3d7f7 (diff)
download   mongo-af51dce90f14ae0671e2151c85bff1590df40148.tar.gz
SERVER-44975 Make donor shard retry refreshing its filtering metadata until it succeeds before leaving the critical section
Diffstat (limited to 'src/mongo')
-rw-r--r--  src/mongo/db/s/migration_source_manager.cpp  57
1 file changed, 24 insertions, 33 deletions
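
In outline, the commit replaces the previous best-effort, single-attempt metadata refresh with a loop that retries until the refresh succeeds, bailing out only if the server is shutting down. Below is a minimal, self-contained sketch of that retry pattern; refreshMetadata(), gShuttingDown, and the simulated failures are placeholders for illustration, not the real forceShardFilteringMetadataRefresh() / globalInShutdownDeprecated() machinery used in the diff that follows.

// Sketch of the retry-until-success refresh loop (placeholder names, not MongoDB code).
#include <atomic>
#include <iostream>
#include <stdexcept>

static std::atomic<bool> gShuttingDown{false};  // stand-in for globalInShutdownDeprecated()
static int gFailuresLeft = 2;                   // make the stub fail twice, then succeed

void refreshMetadata() {  // stand-in for forceShardFilteringMetadataRefresh()
    if (gFailuresLeft-- > 0)
        throw std::runtime_error("simulated refresh failure");
}

void refreshUntilSuccess() {
    for (int attempts = 1;; attempts++) {
        try {
            refreshMetadata();
            break;  // leave the loop only after a successful refresh
        } catch (const std::exception& ex) {
            if (gShuttingDown.load()) {
                // A clean shutdown is already in progress; stop retrying and let it proceed.
                return;
            }
            std::cerr << "Failed to refresh metadata after " << attempts
                      << " attempts: " << ex.what() << ". Will try to refresh again.\n";
        }
    }
}

int main() {
    refreshUntilSuccess();
    std::cout << "refresh succeeded" << std::endl;
    return 0;
}
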
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index fad4565949d..d839880fac0 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -485,29 +485,27 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() {
<< "metadata also failed"));
}
- // Do a best effort attempt to incrementally refresh the metadata before leaving the critical
- // section. It is okay if the refresh fails because that will cause the metadata to be cleared
- // and subsequent callers will try to do a full refresh.
- try {
- forceShardFilteringMetadataRefresh(_opCtx, getNss(), true);
- } catch (const DBException& ex) {
- UninterruptibleLockGuard noInterrupt(_opCtx->lockState());
- AutoGetCollection autoColl(_opCtx, getNss(), MODE_IX);
-
- CollectionShardingRuntime::get(_opCtx, getNss())->clearFilteringMetadata();
-
- log() << "Failed to refresh metadata after a "
- << (migrationCommitStatus.isOK() ? "failed commit attempt" : "successful commit")
- << ". Metadata was cleared so it will get a full refresh when accessed again."
- << causedBy(redact(ex.toStatus()));
+ // Incrementally refresh the metadata before leaving the critical section.
+ for (int attempts = 1;; attempts++) {
+ try {
+ forceShardFilteringMetadataRefresh(_opCtx, getNss(), true);
+ break;
+ } catch (const DBException& ex) {
+ const auto refreshStatus = ex.toStatus();
+
+ if ((ErrorCodes::isInterruption(refreshStatus.code()) ||
+ ErrorCodes::isShutdownError(refreshStatus.code()) ||
+ refreshStatus == ErrorCodes::CallbackCanceled) &&
+ globalInShutdownDeprecated()) {
+ // Since the server is already doing a clean shutdown, this call will just join
+ // the previous shutdown call.
+ shutdown(waitForShutdown());
+ }
- // migrationCommitStatus may be OK or an error. The migration is considered a success at
- // this point if the commit succeeded. The metadata refresh either occurred or the metadata
- // was safely cleared.
- return migrationCommitStatus.withContext(
- str::stream() << "Orphaned range not cleaned up. Failed to refresh metadata after"
- " migration commit due to '"
- << ex.toString() << "' after commit failed");
+ log() << "Failed to refresh metadata after " << attempts << " attempts, after a "
+ << (migrationCommitStatus.isOK() ? "failed commit attempt" : "successful commit")
+ << causedBy(redact(refreshStatus)) << ". Will try to refresh again.";
+ }
}
const auto refreshedMetadata = _getCurrentMetadataAndCheckEpoch();
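
One detail of the new catch block above is worth calling out: the loop gives up only when the refresh error looks shutdown related (an interruption, shutdown, or CallbackCanceled code) and the server has actually begun a clean shutdown, in which case the shutdown(waitForShutdown()) call simply joins the shutdown already in progress. A hedged sketch of that classification, using stand-in ErrorCode/Status types rather than the real ErrorCodes and Status classes:

// Sketch of the "stop retrying only on shutdown" check (stand-in types, not MongoDB's).
#include <iostream>
#include <string>

enum class ErrorCode { OperationFailed, Interrupted, ShutdownInProgress, CallbackCanceled };

struct Status {
    ErrorCode code;
    std::string reason;
};

bool isShutdownRelated(const Status& s) {
    return s.code == ErrorCode::Interrupted || s.code == ErrorCode::ShutdownInProgress ||
        s.code == ErrorCode::CallbackCanceled;
}

bool shouldKeepRetrying(const Status& refreshStatus, bool serverInShutdown) {
    // Mirror the shape of the check in the diff: abandon the retry loop only when the
    // error is shutdown related and the process really is shutting down cleanly.
    return !(isShutdownRelated(refreshStatus) && serverInShutdown);
}

int main() {
    Status interrupted{ErrorCode::Interrupted, "interrupted during refresh"};
    std::cout << "retry while running: " << shouldKeepRetrying(interrupted, false) << "\n"
              << "retry during shutdown: " << shouldKeepRetrying(interrupted, true) << "\n";
    return 0;
}
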
@@ -515,17 +513,10 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() {
if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
// This condition may only happen if the migration commit has failed for any reason
if (migrationCommitStatus.isOK()) {
- severe() << "The migration commit succeeded, but the new chunk placement was not "
- "reflected after metadata refresh, which is an indication of an "
- "afterOpTime bug.";
- severe() << "The current config server opTime is " << Grid::get(_opCtx)->configOpTime();
- severe() << "The commit response came from "
- << redact(commitChunkMigrationResponse.getValue().hostAndPort->toString())
- << " and contained";
- severe() << " response: "
- << redact(commitChunkMigrationResponse.getValue().response.toString());
-
- fassertFailed(50878);
+ return {ErrorCodes::ConflictingOperationInProgress,
+ "Migration commit succeeded but refresh found that the chunk is still owned; "
+ "this node may be a stale primary of its replica set, and the new primary may "
+ "have re-received the chunk"};
}
// The chunk modification was not applied, so report the original error
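
The second hunk also softens the failure mode for the case where the commit reportedly succeeded but the refreshed metadata still shows the chunk as owned by this shard: instead of fasserting (aborting the server with code 50878), the manager now returns an error, on the theory that this node may be a stale primary whose successor has already re-received the chunk. A small sketch of that behavior change, with a placeholder Status type rather than MongoDB's Status/fassert machinery:

// Sketch of the second hunk's change: report an error instead of crashing the process
// when the refreshed metadata still claims ownership after a "successful" commit.
#include <cstdlib>
#include <iostream>
#include <string>

struct Status {
    bool ok;
    std::string reason;
};

Status onChunkStillOwnedAfterCommit(bool commitSucceeded, bool crashOnInconsistency) {
    if (commitSucceeded) {
        if (crashOnInconsistency) {
            // Old behavior: treat the inconsistency as fatal (fassertFailed(50878)).
            std::abort();
        }
        // New behavior: surface an error; this node may be a stale primary whose
        // successor has already re-received the chunk.
        return {false,
                "Migration commit succeeded but refresh found that the chunk is still owned"};
    }
    // Commit did not succeed: the chunk modification was not applied.
    return {false, "migration commit failed; reporting the original error"};
}

int main() {
    Status s = onChunkStillOwnedAfterCommit(/*commitSucceeded=*/true,
                                            /*crashOnInconsistency=*/false);
    std::cout << s.reason << std::endl;
    return s.ok ? 0 : 1;
}
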