author | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-11-14 08:31:42 -0500
committer | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-11-15 15:39:51 -0500
commit | eacdb58313a1b464e89c44868527fcadc22a67a6 (patch)
tree | 8d60d29245a0671099efd1298b504c1fe3774793 /src/mongo/db/s
parent | e0b312bbe4f2c50470560b92fbcfbdd3e0471d2f (diff)
download | mongo-eacdb58313a1b464e89c44868527fcadc22a67a6.tar.gz
SERVER-26116 reacquire the balancer distlock in drain mode during config primary step-up
Diffstat (limited to 'src/mongo/db/s')
-rw-r--r-- | src/mongo/db/s/balancer/balancer.cpp | 16
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager.cpp | 60
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager.h | 12
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager_test.cpp | 11 |
4 files changed, 60 insertions, 39 deletions
diff --git a/src/mongo/db/s/balancer/balancer.cpp b/src/mongo/db/s/balancer/balancer.cpp
index 48979432fcc..51f06910946 100644
--- a/src/mongo/db/s/balancer/balancer.cpp
+++ b/src/mongo/db/s/balancer/balancer.cpp
@@ -308,17 +308,11 @@ void Balancer::_mainThread() {
     // Take the balancer distributed lock and hold it permanently. Do the attempts with single
     // attempts in order to not block the thread and be able to check for interrupt more frequently.
     while (!_stopRequested()) {
-        auto distLockHandleStatus =
-            shardingContext->catalogClient(txn.get())->getDistLockManager()->lockWithSessionID(
-                txn.get(),
-                "balancer",
-                "CSRS Balancer",
-                OID::gen(),
-                DistLockManager::kSingleLockAttemptTimeout);
-        if (!distLockHandleStatus.isOK()) {
-            warning() << "Balancer distributed lock could not be acquired and will be retried in "
-                      << durationCount<Seconds>(kInitBackoffInterval) << " seconds"
-                      << causedBy(distLockHandleStatus.getStatus());
+        auto status = _migrationManager.tryTakeBalancerLock(txn.get(), "CSRS Balancer");
+        if (!status.isOK()) {
+            log() << "Balancer distributed lock could not be acquired and will be retried in "
+                  << durationCount<Seconds>(kInitBackoffInterval) << " seconds"
+                  << causedBy(redact(status));
 
             _sleepFor(txn.get(), kInitBackoffInterval);
             continue;
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp
index 2be60547810..ce81d189d2a 100644
--- a/src/mongo/db/s/balancer/migration_manager.cpp
+++ b/src/mongo/db/s/balancer/migration_manager.cpp
@@ -130,7 +130,7 @@ bool isErrorDueToConfigStepdown(Status status, bool isStopping) {
 }  // namespace
 
 MigrationManager::MigrationManager(ServiceContext* serviceContext)
-    : _serviceContext(serviceContext), _lockSessionID(OID::gen()) {}
+    : _serviceContext(serviceContext) {}
 
 MigrationManager::~MigrationManager() {
     // The migration manager must be completely quiesced at destruction time
@@ -290,6 +290,17 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
         _abandonActiveMigrationsAndEnableManager(txn);
     });
 
+    auto distLockManager = Grid::get(txn)->catalogClient(txn)->getDistLockManager();
+
+    // Must claim the balancer lock to prevent any 3.2 mongos clients from acquiring it.
+    auto balancerLockStatus = distLockManager->tryLockWithLocalWriteConcern(
+        txn, "balancer", "CSRS Balancer", _lockSessionID);
+    if (!balancerLockStatus.isOK()) {
+        log() << "Failed to acquire balancer distributed lock. Abandoning balancer recovery."
+              << causedBy(redact(balancerLockStatus.getStatus()));
+        return;
+    }
+
     // Load the active migrations from the config.migrations collection.
     auto statusWithMigrationsQueryResponse =
         Grid::get(txn)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
@@ -302,9 +313,9 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
             boost::none);
 
     if (!statusWithMigrationsQueryResponse.isOK()) {
-        warning() << "Unable to read config.migrations collection documents for balancer migration"
-                  << " recovery. Abandoning balancer recovery."
-                  << causedBy(redact(statusWithMigrationsQueryResponse.getStatus()));
+        log() << "Unable to read config.migrations collection documents for balancer migration"
+              << " recovery. Abandoning balancer recovery."
+              << causedBy(redact(statusWithMigrationsQueryResponse.getStatus()));
         return;
     }
@@ -314,10 +325,9 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
             // The format of this migration document is incorrect. The balancer holds a distlock for
            // this migration, but without parsing the migration document we cannot identify which
            // distlock must be released. So we must release all distlocks.
-            warning() << "Unable to parse config.migrations document '"
-                      << redact(migration.toString())
-                      << "' for balancer migration recovery. Abandoning balancer recovery."
-                      << causedBy(redact(statusWithMigrationType.getStatus()));
+            log() << "Unable to parse config.migrations document '" << redact(migration.toString())
+                  << "' for balancer migration recovery. Abandoning balancer recovery."
+                  << causedBy(redact(statusWithMigrationType.getStatus()));
             return;
         }
         MigrationType migrateType = std::move(statusWithMigrationType.getValue());
@@ -330,21 +340,18 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
             // Reacquire the matching distributed lock for this namespace.
             const std::string whyMessage(stream() << "Migrating chunk(s) in collection "
                                                   << migrateType.getNss().ns());
-            auto statusWithDistLockHandle =
-                Grid::get(txn)
-                    ->catalogClient(txn)
-                    ->getDistLockManager()
-                    ->tryLockWithLocalWriteConcern(
-                        txn, migrateType.getNss().ns(), whyMessage, _lockSessionID);
+
+            auto statusWithDistLockHandle = distLockManager->tryLockWithLocalWriteConcern(
+                txn, migrateType.getNss().ns(), whyMessage, _lockSessionID);
+
             if (!statusWithDistLockHandle.isOK() &&
                 statusWithDistLockHandle.getStatus() != ErrorCodes::LockBusy) {
                 // LockBusy is alright because that should mean a 3.2 shard has it for the active
                 // migration.
-                warning() << "Failed to acquire distributed lock for collection '"
-                          << migrateType.getNss().ns()
-                          << "' during balancer recovery of an active migration. Abandoning"
-                          << " balancer recovery."
-                          << causedBy(redact(statusWithDistLockHandle.getStatus()));
+                log() << "Failed to acquire distributed lock for collection '"
+                      << migrateType.getNss().ns()
+                      << "' during balancer recovery of an active migration. Abandoning"
+                      << " balancer recovery."
+                      << causedBy(redact(statusWithDistLockHandle.getStatus()));
                 return;
             }
         }
@@ -393,9 +400,9 @@ void MigrationManager::finishRecovery(OperationContext* txn,
             // This shouldn't happen because the collection was intact and sharded when the previous
            // config primary was active and the dist locks have been held by the balancer
            // throughout. Abort migration recovery.
-            warning() << "Unable to reload chunk metadata for collection '" << nss
-                      << "' during balancer recovery. Abandoning recovery."
-                      << causedBy(redact(scopedCMStatus.getStatus()));
+            log() << "Unable to reload chunk metadata for collection '" << nss
+                  << "' during balancer recovery. Abandoning recovery."
+                  << causedBy(redact(scopedCMStatus.getStatus()));
             return;
         }
@@ -793,6 +800,15 @@ Status MigrationManager::_processRemoteCommandResponse(
     return commandStatus;
 }
 
+Status MigrationManager::tryTakeBalancerLock(OperationContext* txn, StringData whyMessage) {
+    return Grid::get(txn)
+        ->catalogClient(txn)
+        ->getDistLockManager()
+        ->lockWithSessionID(
+            txn, "balancer", whyMessage, _lockSessionID, DistLockManager::kSingleLockAttemptTimeout)
+        .getStatus();
+}
+
 MigrationManager::Migration::Migration(NamespaceString inNss, BSONObj inMoveChunkCmdObj)
     : nss(std::move(inNss)),
       moveChunkCmdObj(std::move(inMoveChunkCmdObj)),
diff --git a/src/mongo/db/s/balancer/migration_manager.h b/src/mongo/db/s/balancer/migration_manager.h
index 8bdc9fdd401..b6ae1dd3ff3 100644
--- a/src/mongo/db/s/balancer/migration_manager.h
+++ b/src/mongo/db/s/balancer/migration_manager.h
@@ -102,8 +102,9 @@ public:
     /**
      * Non-blocking method that puts the migration manager in the kRecovering state, in which
-     * new migration requests will block until finishRecovery is called. Then does local writes to
-     * reacquire the distributed locks for active migrations.
+     * new migration requests will block until finishRecovery is called. Then reacquires distributed
+     * locks for the balancer and any active migrations. The distributed locks are taken with local
+     * write concern, since this is called in drain mode where majority writes are not yet possible.
      *
      * The active migration recovery may fail and be abandoned, setting the state to kEnabled.
      */
@@ -138,6 +139,11 @@ public:
      */
     void drainActiveMigrations();
 
+    /**
+     * Tries to take or take over the balancer distributed lock.
+     */
+    Status tryTakeBalancerLock(OperationContext* txn, StringData whyMessage);
+
 private:
     // The current state of the migration manager
     enum class State {  // Allowed transitions:
@@ -282,7 +288,7 @@ private:
     // Used as a constant session ID for all distributed locks that this MigrationManager holds.
     // Currently required so that locks can be reacquired for the balancer in startRecovery and then
    // overtaken in later operations.
-    OID _lockSessionID;
+    const OID _lockSessionID{OID::gen()};
 
     // Carries migration information over from startRecovery to finishRecovery. Should only be set
     // in startRecovery and then accessed in finishRecovery.
diff --git a/src/mongo/db/s/balancer/migration_manager_test.cpp b/src/mongo/db/s/balancer/migration_manager_test.cpp
index 9e6b8aa1eac..1c5ead4acbf 100644
--- a/src/mongo/db/s/balancer/migration_manager_test.cpp
+++ b/src/mongo/db/s/balancer/migration_manager_test.cpp
@@ -116,8 +116,9 @@ protected:
     void setUpMigration(const ChunkType& chunk, const ShardId& toShard);
 
     /**
-     * Asserts that config.migrations is empty and config.locks contains no locked documents, both
-     * of which should be true if the MigrationManager is inactive and behaving properly.
+     * Asserts that config.migrations is empty and config.locks contains no locked documents other
+     * than the balancer's, both of which should be true if the MigrationManager is inactive and
+     * behaving properly.
      */
     void checkMigrationsCollectionIsEmptyAndLocksAreUnlocked();
 
@@ -250,7 +251,7 @@ void MigrationManagerTest::checkMigrationsCollectionIsEmptyAndLocksAreUnlocked() {
             ReadPreferenceSetting{ReadPreference::PrimaryOnly},
             repl::ReadConcernLevel::kMajorityReadConcern,
             NamespaceString(LocksType::ConfigNS),
-            BSON(LocksType::state(LocksType::LOCKED)),
+            BSON(LocksType::state(LocksType::LOCKED) << LocksType::name("{ '$ne' : 'balancer'}")),
             BSONObj(),
             boost::none);
     Shard::QueryResponse locksQueryResponse = uassertStatusOK(statusWithLocksQueryResponse);
@@ -873,6 +874,10 @@ TEST_F(MigrationManagerTest, MigrationRecovery) {
     setUpMigration(chunk1, kShardId1.toString());
    setUpMigration(chunk2, kShardId3.toString());
 
+    // Mimic all config distlocks being released on config server stepup to primary.
+    auto distLockManager = catalogClient()->getDistLockManager();
+    distLockManager->unlockAll(operationContext(), distLockManager->getProcessID());
+
     _migrationManager->startRecoveryAndAcquireDistLocks(operationContext());
 
     auto future = launchAsync([this] {
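
For context on the control flow this change introduces, the sketch below models the step-up recovery sequence in isolation: claim the balancer distlock first with local write concern (the draining config primary cannot yet do majority writes), then reacquire the collection distlock for each active migration found in config.migrations, tolerating LockBusy because a 3.2 shard may still hold that lock. This is only an illustrative, self-contained approximation; the Status and DistLockManager types below are simplified stand-ins, not the real MongoDB classes, and the function merely mirrors the shape of startRecoveryAndAcquireDistLocks.

#include <iostream>
#include <string>
#include <vector>

// Simplified stand-ins for illustration only.
enum class ErrorCode { OK, LockBusy, LockFailed };

struct Status {
    ErrorCode code = ErrorCode::OK;
    bool isOK() const { return code == ErrorCode::OK; }
};

// Hypothetical lock manager; tryLockWithLocalWriteConcern mirrors the call used during
// recovery, which must use local write concern while the config primary is still draining.
struct DistLockManager {
    Status tryLockWithLocalWriteConcern(const std::string& name, const std::string& why) {
        std::cout << "locally acquiring distlock '" << name << "': " << why << "\n";
        return {};  // pretend the lock was acquired
    }
};

// Sketch of the step-up recovery sequence added by this commit.
bool startRecoveryAndAcquireDistLocks(DistLockManager& distLockManager,
                                      const std::vector<std::string>& activeMigrationNss) {
    // 1. Claim the balancer lock first so no 3.2 mongos can take it over.
    if (!distLockManager.tryLockWithLocalWriteConcern("balancer", "CSRS Balancer").isOK()) {
        std::cout << "Failed to acquire balancer distributed lock. Abandoning recovery.\n";
        return false;
    }

    // 2. Reacquire the collection distlock for each active migration.
    for (const auto& nss : activeMigrationNss) {
        Status s = distLockManager.tryLockWithLocalWriteConcern(
            nss, "Migrating chunk(s) in collection " + nss);
        // LockBusy is tolerated: a 3.2 shard may still hold the lock for the active migration.
        if (!s.isOK() && s.code != ErrorCode::LockBusy) {
            std::cout << "Failed to acquire distlock for '" << nss << "'. Abandoning recovery.\n";
            return false;
        }
    }
    return true;
}

int main() {
    DistLockManager dlm;
    startRecoveryAndAcquireDistLocks(dlm, {"test.foo", "test.bar"});
}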