diff options
author | Randolph Tan <randolph@10gen.com> | 2020-02-28 16:07:08 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-03-03 17:34:22 +0000 |
commit | a68774045ce42d55e82236408bd9cf004c54d12a (patch) | |
tree | 7743d2741025d96512a79cbe0d9ecad74ce369c1 /src | |
parent | 6dc8d169723e787ea6ff1672ffe1465d5610c2a6 (diff) | |
download | mongo-a68774045ce42d55e82236408bd9cf004c54d12a.tar.gz |
SERVER-43846 Add information about how many migration tasks were leftover from the last primary
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/s/migration_util.cpp | 142 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.h | 4 |
3 files changed, 84 insertions, 64 deletions
diff --git a/src/mongo/db/s/migration_util.cpp b/src/mongo/db/s/migration_util.cpp index 09ebea38a8f..6b58974bab4 100644 --- a/src/mongo/db/s/migration_util.cpp +++ b/src/mongo/db/s/migration_util.cpp @@ -51,6 +51,7 @@ #include "mongo/db/s/collection_sharding_runtime.h" #include "mongo/db/s/migration_coordinator.h" #include "mongo/db/s/shard_filtering_metadata_refresh.h" +#include "mongo/db/s/sharding_statistics.h" #include "mongo/db/s/wait_for_majority_service.h" #include "mongo/db/write_concern.h" #include "mongo/executor/task_executor_pool.h" @@ -774,75 +775,88 @@ void resumeMigrationCoordinationsOnStepUp(ServiceContext* serviceContext) { auto uniqueOpCtx = tc->makeOperationContext(); auto opCtx = uniqueOpCtx.get(); + long long migrationRecoveryCount = 0; PersistentTaskStore<MigrationCoordinatorDocument> store( opCtx, NamespaceString::kMigrationCoordinatorsNamespace); Query query; - store.forEach(opCtx, query, [&opCtx](const MigrationCoordinatorDocument& doc) { - LOGV2_DEBUG(22039, 2, "Recovering migration {doc}", "doc"_attr = doc.toBSON()); - - // Create a MigrationCoordinator to complete the coordination. - MigrationCoordinator coordinator(doc); - - if (doc.getDecision()) { - // The decision is already known. - coordinator.setMigrationDecision( - (*doc.getDecision()) == DecisionEnum::kCommitted - ? MigrationCoordinator::Decision::kCommitted - : MigrationCoordinator::Decision::kAborted); + store.forEach( + opCtx, + query, + [&opCtx, &migrationRecoveryCount](const MigrationCoordinatorDocument& doc) { + LOGV2_DEBUG(22039, 2, "Recovering migration {doc}", "doc"_attr = doc.toBSON()); + + migrationRecoveryCount++; + + // Create a MigrationCoordinator to complete the coordination. + MigrationCoordinator coordinator(doc); + + if (doc.getDecision()) { + // The decision is already known. + coordinator.setMigrationDecision( + (*doc.getDecision()) == DecisionEnum::kCommitted + ? 
MigrationCoordinator::Decision::kCommitted + : MigrationCoordinator::Decision::kAborted); + coordinator.completeMigration(opCtx); + return true; + } + + // The decision is not known. Recover the decision from the config server. + + ensureChunkVersionIsGreaterThan( + opCtx, doc.getRange(), doc.getPreMigrationChunkVersion()); + + hangBeforeFilteringMetadataRefresh.pauseWhileSet(); + + refreshFilteringMetadataUntilSuccess(opCtx, doc.getNss()); + + auto refreshedMetadata = [&] { + AutoGetCollection autoColl(opCtx, doc.getNss(), MODE_IS); + auto* const css = CollectionShardingRuntime::get(opCtx, doc.getNss()); + return css->getCurrentMetadataIfKnown(); + }(); + + if (!refreshedMetadata || !(*refreshedMetadata)->isSharded() || + !(*refreshedMetadata)->uuidMatches(doc.getCollectionUuid())) { + LOGV2( + 22040, + "Even after forced refresh, filtering metadata for namespace in " + "migration coordinator doc " + "{doc}{refreshedMetadata_refreshedMetadata_isSharded_is_not_known_has_" + "UUID_" + "that_does_not_match_the_collection_UUID_in_the_coordinator_doc}. " + "Deleting " + "the range deletion tasks on the donor and recipient as " + "well as the migration coordinator document on this node.", + "doc"_attr = doc.toBSON(), + "refreshedMetadata_refreshedMetadata_isSharded_is_not_known_has_UUID_that_does_not_match_the_collection_UUID_in_the_coordinator_doc"_attr = + (!refreshedMetadata || !(*refreshedMetadata)->isSharded() + ? "is not known" + : "has UUID that does not match the collection UUID in the " + "coordinator doc")); + + // TODO (SERVER-45707): Test that range deletion tasks are eventually + // deleted even if the collection is dropped before migration coordination + // is resumed. 
+ deleteRangeDeletionTaskOnRecipient( + opCtx, doc.getRecipientShardId(), doc.getId()); + deleteRangeDeletionTaskLocally(opCtx, doc.getId()); + coordinator.forgetMigration(opCtx); + return true; + } + + if ((*refreshedMetadata)->keyBelongsToMe(doc.getRange().getMin())) { + coordinator.setMigrationDecision(MigrationCoordinator::Decision::kAborted); + } else { + coordinator.setMigrationDecision( + MigrationCoordinator::Decision::kCommitted); + } + coordinator.completeMigration(opCtx); return true; - } - - // The decision is not known. Recover the decision from the config server. - - ensureChunkVersionIsGreaterThan( - opCtx, doc.getRange(), doc.getPreMigrationChunkVersion()); - - hangBeforeFilteringMetadataRefresh.pauseWhileSet(); - - refreshFilteringMetadataUntilSuccess(opCtx, doc.getNss()); - - auto refreshedMetadata = [&] { - AutoGetCollection autoColl(opCtx, doc.getNss(), MODE_IS); - auto* const css = CollectionShardingRuntime::get(opCtx, doc.getNss()); - return css->getCurrentMetadataIfKnown(); - }(); - - if (!refreshedMetadata || !(*refreshedMetadata)->isSharded() || - !(*refreshedMetadata)->uuidMatches(doc.getCollectionUuid())) { - LOGV2( - 22040, - "Even after forced refresh, filtering metadata for namespace in " - "migration coordinator doc " - "{doc}{refreshedMetadata_refreshedMetadata_isSharded_is_not_known_has_UUID_" - "that_does_not_match_the_collection_UUID_in_the_coordinator_doc}. Deleting " - "the range deletion tasks on the donor and recipient as " - "well as the migration coordinator document on this node.", - "doc"_attr = doc.toBSON(), - "refreshedMetadata_refreshedMetadata_isSharded_is_not_known_has_UUID_that_does_not_match_the_collection_UUID_in_the_coordinator_doc"_attr = - (!refreshedMetadata || !(*refreshedMetadata)->isSharded() - ? 
"is not known" - : "has UUID that does not match the collection UUID in the " - "coordinator doc")); - - // TODO (SERVER-45707): Test that range deletion tasks are eventually - // deleted even if the collection is dropped before migration coordination - // is resumed. - deleteRangeDeletionTaskOnRecipient( - opCtx, doc.getRecipientShardId(), doc.getId()); - deleteRangeDeletionTaskLocally(opCtx, doc.getId()); - coordinator.forgetMigration(opCtx); - return true; - } - - if ((*refreshedMetadata)->keyBelongsToMe(doc.getRange().getMin())) { - coordinator.setMigrationDecision(MigrationCoordinator::Decision::kAborted); - } else { - coordinator.setMigrationDecision(MigrationCoordinator::Decision::kCommitted); - } - coordinator.completeMigration(opCtx); - return true; - }); + }); + + ShardingStatistics::get(opCtx).unfinishedMigrationFromPreviousPrimary.store( + migrationRecoveryCount); }) .getAsync([](const Status& status) { if (!status.isOK()) { diff --git a/src/mongo/db/s/sharding_statistics.cpp b/src/mongo/db/s/sharding_statistics.cpp index 30584d9706a..a2fac5ad48f 100644 --- a/src/mongo/db/s/sharding_statistics.cpp +++ b/src/mongo/db/s/sharding_statistics.cpp @@ -65,6 +65,8 @@ void ShardingStatistics::report(BSONObjBuilder* builder) const { builder->append("countDonorMoveChunkLockTimeout", countDonorMoveChunkLockTimeout.load()); builder->append("countDonorMoveChunkAbortConflictingIndexOperation", countDonorMoveChunkAbortConflictingIndexOperation.load()); + builder->append("unfinishedMigrationFromPreviousPrimary", + unfinishedMigrationFromPreviousPrimary.load()); } } // namespace mongo diff --git a/src/mongo/db/s/sharding_statistics.h b/src/mongo/db/s/sharding_statistics.h index 7aae7a24501..79547043949 100644 --- a/src/mongo/db/s/sharding_statistics.h +++ b/src/mongo/db/s/sharding_statistics.h @@ -95,6 +95,10 @@ struct ShardingStatistics { // due to concurrent index operations. 
AtomicWord<long long> countDonorMoveChunkAbortConflictingIndexOperation{0}; + // Total number of migrations leftover from previous primaries that needs to be run to + // completion. Valid only when this process is the repl set primary. + AtomicWord<long long> unfinishedMigrationFromPreviousPrimary{0}; + /** * Obtains the per-process instance of the sharding statistics object. */ |