diff options
author | Marcos José Grillo Ramírez <marcos.grillo@mongodb.com> | 2020-06-16 17:15:16 +0200 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-06-17 12:42:00 +0000 |
commit | 1edd4798f8e72f226ad69269a8b0154b247a8049 (patch) | |
tree | d1304ce4dd9bb175c5c789edbaa11d469ef275ff /src/mongo/db/s/migration_source_manager.cpp | |
parent | 1bfaa4c1ff3be51bea5e63b9b0f0f6d693d9e36c (diff) | |
download | mongo-1edd4798f8e72f226ad69269a8b0154b247a8049.tar.gz |
SERVER-47982 Change the shard version update procedure of the migration source manager
Diffstat (limited to 'src/mongo/db/s/migration_source_manager.cpp')
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 63 |
1 files changed, 39 insertions, 24 deletions
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index 643aa08f353..dfdaf253b1a 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -140,7 +140,8 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, "requestParameters"_attr = redact(_args.toString()), "collectionEpoch"_attr = _args.getVersionEpoch()); - // Force refresh of the metadata to ensure we have the latest + // Make sure the latest shard version is recovered as of the time of the invocation of the + // command. onShardVersionMismatch(_opCtx, getNss(), boost::none); // Snapshot the committed metadata from the time the migration starts @@ -440,29 +441,43 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { Shard::CommandResponse::getEffectiveStatus(commitChunkMigrationResponse); if (!migrationCommitStatus.isOK()) { - migrationutil::ensureChunkVersionIsGreaterThan(_opCtx, _args.getRange(), _chunkVersion); - } - - migrationutil::refreshFilteringMetadataUntilSuccess(_opCtx, getNss()); - - const auto refreshedMetadata = _getCurrentMetadataAndCheckEpoch(); + { + UninterruptibleLockGuard noInterrupt(_opCtx->lockState()); + AutoGetCollection autoColl(_opCtx, getNss(), MODE_IX); + auto* const csr = CollectionShardingRuntime::get(_opCtx, getNss()); + auto csrLock = CollectionShardingRuntime::CSRLock::lockExclusive(_opCtx, csr); - if (refreshedMetadata.keyBelongsToMe(_args.getMinKey())) { - // This condition may only happen if the migration commit has failed for any reason - if (migrationCommitStatus.isOK()) { - return {ErrorCodes::ConflictingOperationInProgress, - "Migration commit succeeded but refresh found that the chunk is still owned; " - "this node may be a stale primary of its replica set, and the new primary may " - "have re-received the chunk"}; + CollectionShardingRuntime::get(_opCtx, getNss())->clearFilteringMetadata(); } + scopedGuard.dismiss(); + _cleanup(false); + // Best-effort recover of the shard version. + onShardVersionMismatchNoExcept(_opCtx, getNss(), boost::none).ignore(); + return migrationCommitStatus; + } - _coordinator->setMigrationDecision(migrationutil::MigrationCoordinator::Decision::kAborted); + try { + forceShardFilteringMetadataRefresh(_opCtx, getNss(), true); + } catch (const DBException& ex) { + { + UninterruptibleLockGuard noInterrupt(_opCtx->lockState()); + AutoGetCollection autoColl(_opCtx, getNss(), MODE_IX); + auto* const csr = CollectionShardingRuntime::get(_opCtx, getNss()); + auto csrLock = CollectionShardingRuntime::CSRLock::lockExclusive(_opCtx, csr); - // The chunk modification was not applied, so report the original error - return migrationCommitStatus.withContext("Chunk move was not successful"); + CollectionShardingRuntime::get(_opCtx, getNss())->clearFilteringMetadata(); + } + scopedGuard.dismiss(); + _cleanup(false); + // Best-effort recover of the shard version. + onShardVersionMismatchNoExcept(_opCtx, getNss(), boost::none).ignore(); + return ex.toStatus(); } // Migration succeeded + + const auto refreshedMetadata = _getCurrentMetadataAndCheckEpoch(); + LOGV2(22018, "Migration succeeded and updated collection version to {updatedCollectionVersion}", "Migration succeeded and updated collection version", @@ -479,7 +494,7 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig() { // Exit the critical section and ensure that all the necessary state is fully persisted before // scheduling orphan cleanup. - _cleanup(); + _cleanup(true); ShardingLogging::get(_opCtx)->logChange( _opCtx, @@ -537,7 +552,7 @@ void MigrationSourceManager::cleanupOnError() { ShardingCatalogClient::kMajorityWriteConcern); try { - _cleanup(); + _cleanup(true); } catch (const DBException& ex) { LOGV2_WARNING(22022, "Failed to clean up migration with request parameters " @@ -613,7 +628,7 @@ void MigrationSourceManager::_notifyChangeStreamsOnRecipientFirstChunk( }); } -void MigrationSourceManager::_cleanup() { +void MigrationSourceManager::_cleanup(bool completeMigration) { invariant(_state != kDone); auto cloneDriver = [&]() { @@ -668,15 +683,15 @@ void MigrationSourceManager::_cleanup() { ShardingStateRecovery::endMetadataOp(_opCtx); } - if (_state >= kCloning) { + if (completeMigration && _state >= kCloning) { invariant(_coordinator); if (_state < kCommittingOnConfig) { _coordinator->setMigrationDecision( migrationutil::MigrationCoordinator::Decision::kAborted); } - // This can be called on an exception path after the OperationContext has been - // interrupted, so use a new OperationContext. Note, it's valid to call - // getServiceContext on an interrupted OperationContext. + // This can be called on an exception path after the OperationContext has been interrupted, + // so use a new OperationContext. Note, it's valid to call getServiceContext on an + // interrupted OperationContext. auto newClient = _opCtx->getServiceContext()->makeClient("MigrationCoordinator"); { stdx::lock_guard<Client> lk(*newClient.get()); |