diff options
author | Esha Maharishi <esha.maharishi@mongodb.com> | 2019-12-12 18:41:44 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-12-12 18:41:44 +0000 |
commit | af51dce90f14ae0671e2151c85bff1590df40148 (patch) | |
tree | 7f7c72bbe5d0f141dfa876db825c53d4c6c0e6a3 /src/mongo | |
parent | c9b4a3d1d736036b3a6669953952308e96b3d7f7 (diff) | |
download | mongo-af51dce90f14ae0671e2151c85bff1590df40148.tar.gz |
SERVER-44975 Make donor shard retry refreshing its filtering metadata until it succeeds before leaving the critical section
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 57 |
1 file changed, 24 insertions, 33 deletions
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index fad4565949d..d839880fac0 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -485,29 +485,27 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { << "metadata also failed")); } - // Do a best effort attempt to incrementally refresh the metadata before leaving the critical - // section. It is okay if the refresh fails because that will cause the metadata to be cleared - // and subsequent callers will try to do a full refresh. - try { - forceShardFilteringMetadataRefresh(_opCtx, getNss(), true); - } catch (const DBException& ex) { - UninterruptibleLockGuard noInterrupt(_opCtx->lockState()); - AutoGetCollection autoColl(_opCtx, getNss(), MODE_IX); - - CollectionShardingRuntime::get(_opCtx, getNss())->clearFilteringMetadata(); - - log() << "Failed to refresh metadata after a " - << (migrationCommitStatus.isOK() ? "failed commit attempt" : "successful commit") - << ". Metadata was cleared so it will get a full refresh when accessed again." - << causedBy(redact(ex.toStatus())); + // Incrementally refresh the metadata before leaving the critical section. + for (int attempts = 1;; attempts++) { + try { + forceShardFilteringMetadataRefresh(_opCtx, getNss(), true); + break; + } catch (const DBException& ex) { + const auto refreshStatus = ex.toStatus(); + + if ((ErrorCodes::isInterruption(refreshStatus.code()) || + ErrorCodes::isShutdownError(refreshStatus.code()) || + refreshStatus == ErrorCodes::CallbackCanceled) && + globalInShutdownDeprecated()) { + // Since the server is already doing a clean shutdown, this call will just join + // the previous shutdown call. + shutdown(waitForShutdown()); + } - // migrationCommitStatus may be OK or an error. The migration is considered a success at - // this point if the commit succeeded. 
The metadata refresh either occurred or the metadata - // was safely cleared. - return migrationCommitStatus.withContext( - str::stream() << "Orphaned range not cleaned up. Failed to refresh metadata after" - " migration commit due to '" - << ex.toString() << "' after commit failed"); + log() << "Failed to refresh metadata after " << attempts << " attempts, after a " + << (migrationCommitStatus.isOK() ? "failed commit attempt" : "successful commit") + << causedBy(redact(refreshStatus)) << ". Will try to refresh again."; + } } const auto refreshedMetadata = _getCurrentMetadataAndCheckEpoch(); @@ -515,17 +513,10 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) { // This condition may only happen if the migration commit has failed for any reason if (migrationCommitStatus.isOK()) { - severe() << "The migration commit succeeded, but the new chunk placement was not " - "reflected after metadata refresh, which is an indication of an " - "afterOpTime bug."; - severe() << "The current config server opTime is " << Grid::get(_opCtx)->configOpTime(); - severe() << "The commit response came from " - << redact(commitChunkMigrationResponse.getValue().hostAndPort->toString()) - << " and contained"; - severe() << " response: " - << redact(commitChunkMigrationResponse.getValue().response.toString()); - - fassertFailed(50878); + return {ErrorCodes::ConflictingOperationInProgress, + "Migration commit succeeded but refresh found that the chunk is still owned; " + "this node may be a stale primary of its replica set, and the new primary may " + "have re-received the chunk"}; } // The chunk modification was not applied, so report the original error |