author     Dianna Hohensee <dianna.hohensee@10gen.com>    2016-11-14 08:31:42 -0500
committer  Dianna Hohensee <dianna.hohensee@10gen.com>    2016-11-15 15:39:51 -0500
commit     eacdb58313a1b464e89c44868527fcadc22a67a6 (patch)
tree       8d60d29245a0671099efd1298b504c1fe3774793 /src/mongo/db/s
parent     e0b312bbe4f2c50470560b92fbcfbdd3e0471d2f (diff)
download   mongo-eacdb58313a1b464e89c44868527fcadc22a67a6.tar.gz
SERVER-26116 reacquire the balancer distlock in drain mode during config primary step-up
Diffstat (limited to 'src/mongo/db/s')
-rw-r--r--  src/mongo/db/s/balancer/balancer.cpp                 16
-rw-r--r--  src/mongo/db/s/balancer/migration_manager.cpp        60
-rw-r--r--  src/mongo/db/s/balancer/migration_manager.h          12
-rw-r--r--  src/mongo/db/s/balancer/migration_manager_test.cpp   11
4 files changed, 60 insertions, 39 deletions
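
For context: after this change, the balancer distributed lock and the per-collection migration distlocks are all held under the MigrationManager's single constant session ID (_lockSessionID). startRecoveryAndAcquireDistLocks claims the balancer lock with local write concern while the config primary is still draining, and Balancer::_mainThread later re-requests it through tryTakeBalancerLock with the same session ID, so the lock is overtaken rather than refused as busy. Below is a minimal standalone sketch of that session-ID reuse pattern; it is not MongoDB code, and ToyDistLockManager, OID, and Status are simplified stand-ins for the real sharding API.

// Standalone sketch (toy types, not the real DistLockManager interface).
#include <iostream>
#include <map>
#include <string>

struct OID {
    int value;
    bool operator==(const OID& other) const { return value == other.value; }
};

struct Status {
    bool ok;
    std::string reason;
};

// Toy lock manager: at most one owning session ID per lock name.
class ToyDistLockManager {
public:
    // Best-effort acquisition used during recovery (stand-in for
    // tryLockWithLocalWriteConcern): fails only if another session holds it.
    Status tryLock(const std::string& name, const OID& session) {
        auto it = _owners.find(name);
        if (it != _owners.end() && !(it->second == session))
            return {false, "LockBusy: held by another session"};
        _owners[name] = session;
        return {true, ""};
    }

    // Acquisition with take-over semantics (stand-in for lockWithSessionID):
    // presenting the session ID that already owns the lock succeeds instead
    // of returning LockBusy.
    Status lockWithSessionID(const std::string& name, const OID& session) {
        return tryLock(name, session);  // same rule in this toy model
    }

private:
    std::map<std::string, OID> _owners;
};

int main() {
    ToyDistLockManager mgr;
    const OID migrationManagerSession{42};  // analogous to _lockSessionID

    // Step-up / drain mode: recovery claims the balancer lock under the
    // MigrationManager's constant session ID.
    Status recovery = mgr.tryLock("balancer", migrationManagerSession);
    std::cout << "recovery acquire ok: " << recovery.ok << "\n";

    // Later, the balancer main thread re-requests the same lock with the
    // same session ID, so it is taken over instead of failing as busy.
    Status mainThread = mgr.lockWithSessionID("balancer", migrationManagerSession);
    std::cout << "main-thread reacquire ok: " << mainThread.ok << "\n";

    // A different session (e.g. a 3.2 mongos) is still rejected.
    Status other = mgr.lockWithSessionID("balancer", OID{7});
    std::cout << "other session acquire ok: " << other.ok << "\n";
    return 0;
}
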
diff --git a/src/mongo/db/s/balancer/balancer.cpp b/src/mongo/db/s/balancer/balancer.cpp
index 48979432fcc..51f06910946 100644
--- a/src/mongo/db/s/balancer/balancer.cpp
+++ b/src/mongo/db/s/balancer/balancer.cpp
@@ -308,17 +308,11 @@ void Balancer::_mainThread() {
// Take the balancer distributed lock and hold it permanently. Do the attempts with single
// attempts in order to not block the thread and be able to check for interrupt more frequently.
while (!_stopRequested()) {
- auto distLockHandleStatus =
- shardingContext->catalogClient(txn.get())->getDistLockManager()->lockWithSessionID(
- txn.get(),
- "balancer",
- "CSRS Balancer",
- OID::gen(),
- DistLockManager::kSingleLockAttemptTimeout);
- if (!distLockHandleStatus.isOK()) {
- warning() << "Balancer distributed lock could not be acquired and will be retried in "
- << durationCount<Seconds>(kInitBackoffInterval) << " seconds"
- << causedBy(distLockHandleStatus.getStatus());
+ auto status = _migrationManager.tryTakeBalancerLock(txn.get(), "CSRS Balancer");
+ if (!status.isOK()) {
+ log() << "Balancer distributed lock could not be acquired and will be retried in "
+ << durationCount<Seconds>(kInitBackoffInterval) << " seconds"
+ << causedBy(redact(status));
_sleepFor(txn.get(), kInitBackoffInterval);
continue;
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp
index 2be60547810..ce81d189d2a 100644
--- a/src/mongo/db/s/balancer/migration_manager.cpp
+++ b/src/mongo/db/s/balancer/migration_manager.cpp
@@ -130,7 +130,7 @@ bool isErrorDueToConfigStepdown(Status status, bool isStopping) {
} // namespace
MigrationManager::MigrationManager(ServiceContext* serviceContext)
- : _serviceContext(serviceContext), _lockSessionID(OID::gen()) {}
+ : _serviceContext(serviceContext) {}
MigrationManager::~MigrationManager() {
// The migration manager must be completely quiesced at destruction time
@@ -290,6 +290,17 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
_abandonActiveMigrationsAndEnableManager(txn);
});
+ auto distLockManager = Grid::get(txn)->catalogClient(txn)->getDistLockManager();
+
+ // Must claim the balancer lock to prevent any 3.2 mongos clients from acquiring it.
+ auto balancerLockStatus = distLockManager->tryLockWithLocalWriteConcern(
+ txn, "balancer", "CSRS Balancer", _lockSessionID);
+ if (!balancerLockStatus.isOK()) {
+ log() << "Failed to acquire balancer distributed lock. Abandoning balancer recovery."
+ << causedBy(redact(balancerLockStatus.getStatus()));
+ return;
+ }
+
// Load the active migrations from the config.migrations collection.
auto statusWithMigrationsQueryResponse =
Grid::get(txn)->shardRegistry()->getConfigShard()->exhaustiveFindOnConfig(
@@ -302,9 +313,9 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
boost::none);
if (!statusWithMigrationsQueryResponse.isOK()) {
- warning() << "Unable to read config.migrations collection documents for balancer migration"
- << " recovery. Abandoning balancer recovery."
- << causedBy(redact(statusWithMigrationsQueryResponse.getStatus()));
+ log() << "Unable to read config.migrations collection documents for balancer migration"
+ << " recovery. Abandoning balancer recovery."
+ << causedBy(redact(statusWithMigrationsQueryResponse.getStatus()));
return;
}
@@ -314,10 +325,9 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
// The format of this migration document is incorrect. The balancer holds a distlock for
// this migration, but without parsing the migration document we cannot identify which
// distlock must be released. So we must release all distlocks.
- warning() << "Unable to parse config.migrations document '"
- << redact(migration.toString())
- << "' for balancer migration recovery. Abandoning balancer recovery."
- << causedBy(redact(statusWithMigrationType.getStatus()));
+ log() << "Unable to parse config.migrations document '" << redact(migration.toString())
+ << "' for balancer migration recovery. Abandoning balancer recovery."
+ << causedBy(redact(statusWithMigrationType.getStatus()));
return;
}
MigrationType migrateType = std::move(statusWithMigrationType.getValue());
@@ -330,21 +340,18 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) {
// Reacquire the matching distributed lock for this namespace.
const std::string whyMessage(stream() << "Migrating chunk(s) in collection "
<< migrateType.getNss().ns());
- auto statusWithDistLockHandle =
- Grid::get(txn)
- ->catalogClient(txn)
- ->getDistLockManager()
- ->tryLockWithLocalWriteConcern(
- txn, migrateType.getNss().ns(), whyMessage, _lockSessionID);
+
+ auto statusWithDistLockHandle = distLockManager->tryLockWithLocalWriteConcern(
+ txn, migrateType.getNss().ns(), whyMessage, _lockSessionID);
if (!statusWithDistLockHandle.isOK() &&
statusWithDistLockHandle.getStatus() != ErrorCodes::LockBusy) {
// LockBusy is alright because that should mean a 3.2 shard has it for the active
// migration.
- warning() << "Failed to acquire distributed lock for collection '"
- << migrateType.getNss().ns()
- << "' during balancer recovery of an active migration. Abandoning"
- << " balancer recovery."
- << causedBy(redact(statusWithDistLockHandle.getStatus()));
+ log() << "Failed to acquire distributed lock for collection '"
+ << migrateType.getNss().ns()
+ << "' during balancer recovery of an active migration. Abandoning"
+ << " balancer recovery."
+ << causedBy(redact(statusWithDistLockHandle.getStatus()));
return;
}
}
@@ -393,9 +400,9 @@ void MigrationManager::finishRecovery(OperationContext* txn,
// This shouldn't happen because the collection was intact and sharded when the previous
// config primary was active and the dist locks have been held by the balancer
// throughout. Abort migration recovery.
- warning() << "Unable to reload chunk metadata for collection '" << nss
- << "' during balancer recovery. Abandoning recovery."
- << causedBy(redact(scopedCMStatus.getStatus()));
+ log() << "Unable to reload chunk metadata for collection '" << nss
+ << "' during balancer recovery. Abandoning recovery."
+ << causedBy(redact(scopedCMStatus.getStatus()));
return;
}
@@ -793,6 +800,15 @@ Status MigrationManager::_processRemoteCommandResponse(
return commandStatus;
}
+Status MigrationManager::tryTakeBalancerLock(OperationContext* txn, StringData whyMessage) {
+ return Grid::get(txn)
+ ->catalogClient(txn)
+ ->getDistLockManager()
+ ->lockWithSessionID(
+ txn, "balancer", whyMessage, _lockSessionID, DistLockManager::kSingleLockAttemptTimeout)
+ .getStatus();
+}
+
MigrationManager::Migration::Migration(NamespaceString inNss, BSONObj inMoveChunkCmdObj)
: nss(std::move(inNss)),
moveChunkCmdObj(std::move(inMoveChunkCmdObj)),
diff --git a/src/mongo/db/s/balancer/migration_manager.h b/src/mongo/db/s/balancer/migration_manager.h
index 8bdc9fdd401..b6ae1dd3ff3 100644
--- a/src/mongo/db/s/balancer/migration_manager.h
+++ b/src/mongo/db/s/balancer/migration_manager.h
@@ -102,8 +102,9 @@ public:
/**
* Non-blocking method that puts the migration manager in the kRecovering state, in which
- * new migration requests will block until finishRecovery is called. Then does local writes to
- * reacquire the distributed locks for active migrations.
+ * new migration requests will block until finishRecovery is called. Then reacquires distributed
+ * locks for the balancer and any active migrations. The distributed locks are taken with local
+ * write concern, since this is called in drain mode where majority writes are not yet possible.
*
* The active migration recovery may fail and be abandoned, setting the state to kEnabled.
*/
@@ -138,6 +139,11 @@ public:
*/
void drainActiveMigrations();
+ /**
+ * Tries to take or take over the balancer distributed lock.
+ */
+ Status tryTakeBalancerLock(OperationContext* txn, StringData whyMessage);
+
private:
// The current state of the migration manager
enum class State { // Allowed transitions:
@@ -282,7 +288,7 @@ private:
// Used as a constant session ID for all distributed locks that this MigrationManager holds.
// Currently required so that locks can be reacquired for the balancer in startRecovery and then
// overtaken in later operations.
- OID _lockSessionID;
+ const OID _lockSessionID{OID::gen()};
// Carries migration information over from startRecovery to finishRecovery. Should only be set
// in startRecovery and then accessed in finishRecovery.
diff --git a/src/mongo/db/s/balancer/migration_manager_test.cpp b/src/mongo/db/s/balancer/migration_manager_test.cpp
index 9e6b8aa1eac..1c5ead4acbf 100644
--- a/src/mongo/db/s/balancer/migration_manager_test.cpp
+++ b/src/mongo/db/s/balancer/migration_manager_test.cpp
@@ -116,8 +116,9 @@ protected:
void setUpMigration(const ChunkType& chunk, const ShardId& toShard);
/**
- * Asserts that config.migrations is empty and config.locks contains no locked documents, both
- * of which should be true if the MigrationManager is inactive and behaving properly.
+ * Asserts that config.migrations is empty and config.locks contains no locked documents other
+ * than the balancer's, both of which should be true if the MigrationManager is inactive and
+ * behaving properly.
*/
void checkMigrationsCollectionIsEmptyAndLocksAreUnlocked();
@@ -250,7 +251,7 @@ void MigrationManagerTest::checkMigrationsCollectionIsEmptyAndLocksAreUnlocked()
ReadPreferenceSetting{ReadPreference::PrimaryOnly},
repl::ReadConcernLevel::kMajorityReadConcern,
NamespaceString(LocksType::ConfigNS),
- BSON(LocksType::state(LocksType::LOCKED)),
+ BSON(LocksType::state(LocksType::LOCKED) << LocksType::name("{ '$ne' : 'balancer'}")),
BSONObj(),
boost::none);
Shard::QueryResponse locksQueryResponse = uassertStatusOK(statusWithLocksQueryResponse);
@@ -873,6 +874,10 @@ TEST_F(MigrationManagerTest, MigrationRecovery) {
setUpMigration(chunk1, kShardId1.toString());
setUpMigration(chunk2, kShardId3.toString());
+ // Mimic all config distlocks being released on config server stepup to primary.
+ auto distLockManager = catalogClient()->getDistLockManager();
+ distLockManager->unlockAll(operationContext(), distLockManager->getProcessID());
+
_migrationManager->startRecoveryAndAcquireDistLocks(operationContext());
auto future = launchAsync([this] {