diff options
Diffstat (limited to 'src/mongo/db/s/balancer')
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager.cpp | 188 | ||||
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager.h | 36 | ||||
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager_test.cpp | 257 |
3 files changed, 58 insertions, 423 deletions
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp index bd8465338f0..6bccabf7326 100644 --- a/src/mongo/db/s/balancer/migration_manager.cpp +++ b/src/mongo/db/s/balancer/migration_manager.cpp @@ -61,15 +61,17 @@ using str::stream; namespace { -const char kChunkTooBig[] = "chunkTooBig"; +const char kChunkTooBig[] = "chunkTooBig"; // TODO: delete in 3.8 const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, Seconds(15)); /** * Parses the 'commandResponse' and converts it to a status to use as the outcome of the command. - * Preserves backwards compatibility with 3.2 and earlier shards that, rather than use a ChunkTooBig + * Preserves backwards compatibility with 3.4 and earlier shards that, rather than use a ChunkTooBig * error code, include an extra field in the response. + * + * TODO: Delete in 3.8 */ Status extractMigrationStatusFromCommandResponse(const BSONObj& commandResponse) { Status commandStatus = getStatusFromCommandResult(commandResponse); @@ -98,23 +100,13 @@ StatusWith<DistLockHandle> acquireDistLock(OperationContext* txn, txn, nss.ns(), whyMessage, lockSessionID, DistLockManager::kSingleLockAttemptTimeout); if (!statusWithDistLockHandle.isOK()) { - // If we get LockBusy while trying to acquire the collection distributed lock, this implies - // that a concurrent collection operation is running either on a 3.2 shard or on mongos. - // Convert it to ConflictingOperationInProgress to better indicate the error. - // - // In addition, the code which re-schedules parallel migrations serially for 3.2 shard - // compatibility uses the LockBusy code as a hint to do the reschedule. - const ErrorCodes::Error code = (statusWithDistLockHandle == ErrorCodes::LockBusy - ? ErrorCodes::ConflictingOperationInProgress - : statusWithDistLockHandle.getStatus().code()); - - return {code, + return {statusWithDistLockHandle.getStatus().code(), stream() << "Could not acquire collection lock for " << nss.ns() << " to migrate chunks, due to " << statusWithDistLockHandle.getStatus().reason()}; } - return std::move(statusWithDistLockHandle.getValue()); + return statusWithDistLockHandle; } /** @@ -134,7 +126,7 @@ MigrationManager::MigrationManager(ServiceContext* serviceContext) MigrationManager::~MigrationManager() { // The migration manager must be completely quiesced at destruction time - invariant(_activeMigrationsWithoutDistLock.empty()); + invariant(_activeMigrations.empty()); } MigrationStatuses MigrationManager::executeMigrationsForAutoBalance( @@ -146,8 +138,6 @@ MigrationStatuses MigrationManager::executeMigrationsForAutoBalance( MigrationStatuses migrationStatuses; - vector<MigrateInfo> rescheduledMigrations; - { std::map<MigrationIdentifier, ScopedMigrationRequest> scopedMigrationRequests; vector<std::pair<shared_ptr<Notification<RemoteCommandResponse>>, MigrateInfo>> responses; @@ -165,18 +155,12 @@ MigrationStatuses MigrationManager::executeMigrationsForAutoBalance( scopedMigrationRequests.emplace(migrateInfo.getName(), std::move(statusWithScopedMigrationRequest.getValue())); - responses.emplace_back(_schedule(txn, - migrateInfo, - false, // Config server takes the collection dist lock - maxChunkSizeBytes, - secondaryThrottle, - waitForDelete), - migrateInfo); + responses.emplace_back( + _schedule(txn, migrateInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete), + migrateInfo); } - // Wait for all the scheduled migrations to complete and note the ones, which failed with a - // LockBusy error code. These need to be executed serially, without the distributed lock - // being held by the config server for backwards compatibility with 3.2 shards. + // Wait for all the scheduled migrations to complete. for (auto& response : responses) { auto notification = std::move(response.first); auto migrateInfo = std::move(response.second); @@ -187,39 +171,8 @@ MigrationStatuses MigrationManager::executeMigrationsForAutoBalance( invariant(it != scopedMigrationRequests.end()); Status commandStatus = _processRemoteCommandResponse(remoteCommandResponse, &it->second); - if (commandStatus == ErrorCodes::LockBusy) { - rescheduledMigrations.emplace_back(std::move(migrateInfo)); - } else { - migrationStatuses.emplace(migrateInfo.getName(), std::move(commandStatus)); - } - } - } - - // Schedule all 3.2 compatibility migrations sequentially - for (const auto& migrateInfo : rescheduledMigrations) { - // Write a document to the config.migrations collection, in case this migration must be - // recovered by the Balancer. Fail if the chunk is already moving. - auto statusWithScopedMigrationRequest = - ScopedMigrationRequest::writeMigration(txn, migrateInfo, waitForDelete); - if (!statusWithScopedMigrationRequest.isOK()) { - migrationStatuses.emplace(migrateInfo.getName(), - std::move(statusWithScopedMigrationRequest.getStatus())); - continue; + migrationStatuses.emplace(migrateInfo.getName(), std::move(commandStatus)); } - - RemoteCommandResponse remoteCommandResponse = - _schedule(txn, - migrateInfo, - true, // Shard takes the collection dist lock - maxChunkSizeBytes, - secondaryThrottle, - waitForDelete) - ->get(); - - Status commandStatus = _processRemoteCommandResponse( - remoteCommandResponse, &statusWithScopedMigrationRequest.getValue()); - - migrationStatuses.emplace(migrateInfo.getName(), std::move(commandStatus)); } invariant(migrationStatuses.size() == migrateInfos.size()); @@ -244,13 +197,7 @@ Status MigrationManager::executeManualMigration( } RemoteCommandResponse remoteCommandResponse = - _schedule(txn, - migrateInfo, - false, // Config server takes the collection dist lock - maxChunkSizeBytes, - secondaryThrottle, - waitForDelete) - ->get(); + _schedule(txn, migrateInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete)->get(); auto scopedCMStatus = ScopedChunkManager::refreshAndGet(txn, NamespaceString(migrateInfo.ns)); if (!scopedCMStatus.isOK()) { @@ -343,10 +290,7 @@ void MigrationManager::startRecoveryAndAcquireDistLocks(OperationContext* txn) { auto statusWithDistLockHandle = distLockManager->tryLockWithLocalWriteConcern( txn, migrateType.getNss().ns(), whyMessage, _lockSessionID); - if (!statusWithDistLockHandle.isOK() && - statusWithDistLockHandle.getStatus() != ErrorCodes::LockBusy) { - // LockBusy is alright because that should mean a 3.2 shard has it for the active - // migration. + if (!statusWithDistLockHandle.isOK()) { log() << "Failed to acquire distributed lock for collection '" << migrateType.getNss().ns() << "' during balancer recovery of an active migration. Abandoning" @@ -432,12 +376,8 @@ void MigrationManager::finishRecovery(OperationContext* txn, scheduledMigrations++; - responses.emplace_back(_schedule(txn, - migrationInfo, - false, // Config server takes the collection dist lock - maxChunkSizeBytes, - secondaryThrottle, - waitForDelete)); + responses.emplace_back( + _schedule(txn, migrationInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete)); } // If no migrations were scheduled for this namespace, free the dist lock @@ -473,7 +413,7 @@ void MigrationManager::interruptAndDisableMigrations() { _state = State::kStopping; // Interrupt any active migrations with dist lock - for (auto& cmsEntry : _activeMigrationsWithDistLock) { + for (auto& cmsEntry : _activeMigrations) { auto* cms = &cmsEntry.second; for (auto& migration : cms->migrations) { @@ -483,13 +423,6 @@ void MigrationManager::interruptAndDisableMigrations() { } } - // Interrupt any active migrations without dist lock - for (auto& migration : _activeMigrationsWithoutDistLock) { - if (migration.callbackHandle) { - executor->cancel(*migration.callbackHandle); - } - } - _checkDrained_inlock(); } @@ -499,18 +432,13 @@ void MigrationManager::drainActiveMigrations() { if (_state == State::kStopped) return; invariant(_state == State::kStopping); - - _condVar.wait(lock, [this] { - return _activeMigrationsWithDistLock.empty() && _activeMigrationsWithoutDistLock.empty(); - }); - + _condVar.wait(lock, [this] { return _activeMigrations.empty(); }); _state = State::kStopped; } shared_ptr<Notification<RemoteCommandResponse>> MigrationManager::_schedule( OperationContext* txn, const MigrateInfo& migrateInfo, - bool shardTakesCollectionDistLock, uint64_t maxChunkSizeBytes, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { @@ -575,8 +503,7 @@ shared_ptr<Notification<RemoteCommandResponse>> MigrationManager::_schedule( chunk->getLastmod(), maxChunkSizeBytes, secondaryThrottle, - waitForDelete, - shardTakesCollectionDistLock); + waitForDelete); stdx::lock_guard<stdx::mutex> lock(_mutex); @@ -590,24 +517,20 @@ shared_ptr<Notification<RemoteCommandResponse>> MigrationManager::_schedule( auto retVal = migration.completionNotification; - if (shardTakesCollectionDistLock) { - _scheduleWithoutDistLock_inlock(txn, fromHostStatus.getValue(), std::move(migration)); - } else { - _scheduleWithDistLock_inlock(txn, fromHostStatus.getValue(), std::move(migration)); - } + _schedule_inlock(txn, fromHostStatus.getValue(), std::move(migration)); return retVal; } -void MigrationManager::_scheduleWithDistLock_inlock(OperationContext* txn, - const HostAndPort& targetHost, - Migration migration) { +void MigrationManager::_schedule_inlock(OperationContext* txn, + const HostAndPort& targetHost, + Migration migration) { executor::TaskExecutor* const executor = Grid::get(txn)->getExecutorPool()->getFixedExecutor(); const NamespaceString nss(migration.nss); - auto it = _activeMigrationsWithDistLock.find(nss); - if (it == _activeMigrationsWithDistLock.end()) { + auto it = _activeMigrations.find(nss); + if (it == _activeMigrations.end()) { // Acquire the collection distributed lock (blocking call) auto distLockHandleStatus = acquireDistLock(txn, _lockSessionID, nss); if (!distLockHandleStatus.isOK()) { @@ -615,7 +538,7 @@ void MigrationManager::_scheduleWithDistLock_inlock(OperationContext* txn, return; } - it = _activeMigrationsWithDistLock + it = _activeMigrations .insert(std::make_pair( nss, CollectionMigrationsState(std::move(distLockHandleStatus.getValue())))) .first; @@ -640,7 +563,7 @@ void MigrationManager::_scheduleWithDistLock_inlock(OperationContext* txn, auto txn = cc().makeOperationContext(); stdx::lock_guard<stdx::mutex> lock(_mutex); - _completeWithDistLock_inlock(txn.get(), itMigration, args.response); + _complete_inlock(txn.get(), itMigration, args.response); }); if (callbackHandleWithStatus.isOK()) { @@ -648,13 +571,12 @@ void MigrationManager::_scheduleWithDistLock_inlock(OperationContext* txn, return; } - _completeWithDistLock_inlock(txn, itMigration, std::move(callbackHandleWithStatus.getStatus())); + _complete_inlock(txn, itMigration, std::move(callbackHandleWithStatus.getStatus())); } -void MigrationManager::_completeWithDistLock_inlock( - OperationContext* txn, - MigrationsList::iterator itMigration, - const RemoteCommandResponse& remoteCommandResponse) { +void MigrationManager::_complete_inlock(OperationContext* txn, + MigrationsList::iterator itMigration, + const RemoteCommandResponse& remoteCommandResponse) { const NamespaceString nss(itMigration->nss); // Make sure to signal the notification last, after the distributed lock is freed, so that we @@ -662,8 +584,8 @@ void MigrationManager::_completeWithDistLock_inlock( // still acquired. auto notificationToSignal = itMigration->completionNotification; - auto it = _activeMigrationsWithDistLock.find(nss); - invariant(it != _activeMigrationsWithDistLock.end()); + auto it = _activeMigrations.find(nss); + invariant(it != _activeMigrations.end()); auto collectionMigrationState = &it->second; collectionMigrationState->migrations.erase(itMigration); @@ -671,58 +593,20 @@ void MigrationManager::_completeWithDistLock_inlock( if (collectionMigrationState->migrations.empty()) { Grid::get(txn)->catalogClient(txn)->getDistLockManager()->unlock( txn, collectionMigrationState->distLockHandle, nss.ns()); - _activeMigrationsWithDistLock.erase(it); + _activeMigrations.erase(it); _checkDrained_inlock(); } notificationToSignal->set(remoteCommandResponse); } -void MigrationManager::_scheduleWithoutDistLock_inlock(OperationContext* txn, - const HostAndPort& targetHost, - Migration migration) { - executor::TaskExecutor* const executor = Grid::get(txn)->getExecutorPool()->getFixedExecutor(); - - _activeMigrationsWithoutDistLock.push_front(std::move(migration)); - auto itMigration = _activeMigrationsWithoutDistLock.begin(); - - const RemoteCommandRequest remoteRequest( - targetHost, NamespaceString::kAdminDb.toString(), itMigration->moveChunkCmdObj, txn); - - StatusWith<executor::TaskExecutor::CallbackHandle> callbackHandleWithStatus = - executor->scheduleRemoteCommand( - remoteRequest, - [this, itMigration](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) { - auto notificationToSignal = itMigration->completionNotification; - - stdx::lock_guard<stdx::mutex> lock(_mutex); - - _activeMigrationsWithoutDistLock.erase(itMigration); - _checkDrained_inlock(); - - notificationToSignal->set(args.response); - }); - - if (callbackHandleWithStatus.isOK()) { - itMigration->callbackHandle = std::move(callbackHandleWithStatus.getValue()); - return; - } - - auto notificationToSignal = itMigration->completionNotification; - - _activeMigrationsWithoutDistLock.erase(itMigration); - _checkDrained_inlock(); - - notificationToSignal->set(std::move(callbackHandleWithStatus.getStatus())); -} - void MigrationManager::_checkDrained_inlock() { if (_state == State::kEnabled || _state == State::kRecovering) { return; } invariant(_state == State::kStopping); - if (_activeMigrationsWithDistLock.empty() && _activeMigrationsWithoutDistLock.empty()) { + if (_activeMigrations.empty()) { _condVar.notify_all(); } } @@ -758,6 +642,7 @@ void MigrationManager::_abandonActiveMigrationsAndEnableManager(OperationContext Status MigrationManager::_processRemoteCommandResponse( const RemoteCommandResponse& remoteCommandResponse, ScopedMigrationRequest* scopedMigrationRequest) { + stdx::lock_guard<stdx::mutex> lock(_mutex); Status commandStatus(ErrorCodes::InternalError, "Uninitialized value."); @@ -774,6 +659,7 @@ Status MigrationManager::_processRemoteCommandResponse( if (!remoteCommandResponse.isOK()) { commandStatus = remoteCommandResponse.status; } else { + // TODO: delete in 3.8 commandStatus = extractMigrationStatusFromCommandResponse(remoteCommandResponse.data); } diff --git a/src/mongo/db/s/balancer/migration_manager.h b/src/mongo/db/s/balancer/migration_manager.h index b6ae1dd3ff3..b9611498148 100644 --- a/src/mongo/db/s/balancer/migration_manager.h +++ b/src/mongo/db/s/balancer/migration_manager.h @@ -60,8 +60,6 @@ typedef std::map<MigrationIdentifier, Status> MigrationStatuses; /** * Manages and executes parallel migrations for the balancer. - * - * TODO: for v3.6, remove code making compatible with v3.2 shards that take distlock. */ class MigrationManager { MONGO_DISALLOW_COPYING(MigrationManager); @@ -202,14 +200,10 @@ private: * specified parameters. May block for distributed lock acquisition. If dist lock acquisition is * successful (or not done), schedules the migration request and returns a notification which * can be used to obtain the outcome of the operation. - * - * The 'shardTakesCollectionDistLock' parameter controls whether the distributed lock is - * acquired by the migration manager or by the shard executing the migration request. */ std::shared_ptr<Notification<executor::RemoteCommandResponse>> _schedule( OperationContext* txn, const MigrateInfo& migrateInfo, - bool shardTakesCollectionDistLock, uint64_t maxChunkSizeBytes, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete); @@ -221,9 +215,9 @@ private: * The distributed lock is acquired before scheduling the first migration for the collection and * is only released when all active migrations on the collection have finished. */ - void _scheduleWithDistLock_inlock(OperationContext* txn, - const HostAndPort& targetHost, - Migration migration); + void _schedule_inlock(OperationContext* txn, + const HostAndPort& targetHost, + Migration migration); /** * Used internally for migrations scheduled with the distributed lock acquired by the config @@ -231,21 +225,9 @@ private: * passed iterator and if this is the last migration for the collection will free the collection * distributed lock. */ - void _completeWithDistLock_inlock(OperationContext* txn, - MigrationsList::iterator itMigration, - const executor::RemoteCommandResponse& remoteCommandResponse); - - /** - * Immediately schedules the specified migration without attempting to acquire the collection - * distributed lock or checking that it is not being held. - * - * This method is only used for retrying migrations that have failed with LockBusy errors - * returned by the shard, which only happens with legacy 3.2 shards that take the collection - * distributed lock themselves. - */ - void _scheduleWithoutDistLock_inlock(OperationContext* txn, - const HostAndPort& targetHost, - Migration migration); + void _complete_inlock(OperationContext* txn, + MigrationsList::iterator itMigration, + const executor::RemoteCommandResponse& remoteCommandResponse); /** * If the state of the migration manager is kStopping, checks whether there are any outstanding @@ -306,11 +288,7 @@ private: // Holds information about each collection's distributed lock and active migrations via a // CollectionMigrationState object. - CollectionMigrationsStateMap _activeMigrationsWithDistLock; - - // Holds information about migrations, which have been scheduled without the collection - // distributed lock acquired (i.e., the shard is asked to acquire it). - MigrationsList _activeMigrationsWithoutDistLock; + CollectionMigrationsStateMap _activeMigrations; }; } // namespace mongo diff --git a/src/mongo/db/s/balancer/migration_manager_test.cpp b/src/mongo/db/s/balancer/migration_manager_test.cpp index 1c5ead4acbf..bcca90e4e25 100644 --- a/src/mongo/db/s/balancer/migration_manager_test.cpp +++ b/src/mongo/db/s/balancer/migration_manager_test.cpp @@ -128,11 +128,9 @@ protected: */ void expectMoveChunkCommand(const ChunkType& chunk, const ShardId& toShardId, - const bool& takeDistLock, const BSONObj& response); void expectMoveChunkCommand(const ChunkType& chunk, const ShardId& toShardId, - const bool& takeDistLock, const Status& returnStatus); // Random static initialization order can result in X constructor running before Y constructor @@ -260,9 +258,8 @@ void MigrationManagerTest::checkMigrationsCollectionIsEmptyAndLocksAreUnlocked() void MigrationManagerTest::expectMoveChunkCommand(const ChunkType& chunk, const ShardId& toShardId, - const bool& takeDistLock, const BSONObj& response) { - onCommand([&chunk, &toShardId, &takeDistLock, &response](const RemoteCommandRequest& request) { + onCommand([&chunk, &toShardId, &response](const RemoteCommandRequest& request) { NamespaceString nss(request.cmdObj.firstElement().valueStringData()); ASSERT_EQ(chunk.getNS(), nss.ns()); @@ -276,7 +273,6 @@ void MigrationManagerTest::expectMoveChunkCommand(const ChunkType& chunk, ASSERT_EQ(chunk.getShard(), moveChunkRequestWithStatus.getValue().getFromShardId()); ASSERT_EQ(toShardId, moveChunkRequestWithStatus.getValue().getToShardId()); - ASSERT_EQ(takeDistLock, moveChunkRequestWithStatus.getValue().getTakeDistLock()); return response; }); @@ -284,11 +280,10 @@ void MigrationManagerTest::expectMoveChunkCommand(const ChunkType& chunk, void MigrationManagerTest::expectMoveChunkCommand(const ChunkType& chunk, const ShardId& toShardId, - const bool& takeDistLock, const Status& returnStatus) { BSONObjBuilder resultBuilder; Command::appendCommandStatus(resultBuilder, returnStatus); - expectMoveChunkCommand(chunk, toShardId, takeDistLock, resultBuilder.obj()); + expectMoveChunkCommand(chunk, toShardId, resultBuilder.obj()); } TEST_F(MigrationManagerTest, OneCollectionTwoMigrations) { @@ -334,8 +329,8 @@ TEST_F(MigrationManagerTest, OneCollectionTwoMigrations) { }); // Expect two moveChunk commands. - expectMoveChunkCommand(chunk1, kShardId1, false, Status::OK()); - expectMoveChunkCommand(chunk2, kShardId3, false, Status::OK()); + expectMoveChunkCommand(chunk1, kShardId1, Status::OK()); + expectMoveChunkCommand(chunk2, kShardId3, Status::OK()); // Run the MigrationManager code. future.timed_get(kFutureTimeout); @@ -396,238 +391,15 @@ TEST_F(MigrationManagerTest, TwoCollectionsTwoMigrationsEach) { }); // Expect four moveChunk commands. - expectMoveChunkCommand(chunk1coll1, kShardId1, false, Status::OK()); - expectMoveChunkCommand(chunk2coll1, kShardId3, false, Status::OK()); - expectMoveChunkCommand(chunk1coll2, kShardId1, false, Status::OK()); - expectMoveChunkCommand(chunk2coll2, kShardId3, false, Status::OK()); + expectMoveChunkCommand(chunk1coll1, kShardId1, Status::OK()); + expectMoveChunkCommand(chunk2coll1, kShardId3, Status::OK()); + expectMoveChunkCommand(chunk1coll2, kShardId1, Status::OK()); + expectMoveChunkCommand(chunk2coll2, kShardId3, Status::OK()); // Run the MigrationManager code. future.timed_get(kFutureTimeout); } -// Old v3.2 shards expect to take the distributed lock before executing a moveChunk command. The -// MigrationManager should take the distlock and fail the first moveChunk command with an old shard, -// and then release the lock and retry the command successfully. -TEST_F(MigrationManagerTest, SameCollectionOldShardMigration) { - // Set up two shards in the metadata. - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern)); - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard2, kMajorityWriteConcern)); - - // Set up the database and collection as sharded in the metadata. - std::string dbName = "foo"; - std::string collName = "foo.bar"; - ChunkVersion version(2, 0, OID::gen()); - - setUpDatabase(dbName, kShardId0); - setUpCollection(collName, version); - - // Set up two chunks in the metadata. - ChunkType chunk1 = - setUpChunk(collName, kKeyPattern.globalMin(), BSON(kPattern << 49), kShardId0, version); - version.incMinor(); - ChunkType chunk2 = - setUpChunk(collName, BSON(kPattern << 49), kKeyPattern.globalMax(), kShardId2, version); - - // Going to request that these two chunks get migrated. - const std::vector<MigrateInfo> migrationRequests{{kShardId1, chunk1}, {kShardId3, chunk2}}; - - auto future = launchAsync([this, migrationRequests] { - Client::initThreadIfNotAlready("Test"); - auto txn = cc().makeOperationContext(); - - // Scheduling the moveChunk commands requires finding a host to which to send the command. - // Set up dummy hosts for the source shards. - shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0); - shardTargeterMock(txn.get(), kShardId2)->setFindHostReturnValue(kShardHost2); - - MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance( - txn.get(), migrationRequests, 0, kDefaultSecondaryThrottle, false); - - for (const auto& migrateInfo : migrationRequests) { - ASSERT_OK(migrationStatuses.at(migrateInfo.getName())); - } - }); - - // Expect two moveChunk commands. - expectMoveChunkCommand( - chunk1, - kShardId1, - false, - Status(ErrorCodes::LockBusy, "SameCollectionOldShardMigration generated error.")); - expectMoveChunkCommand(chunk2, kShardId3, false, Status::OK()); - expectMoveChunkCommand(chunk1, kShardId1, true, Status::OK()); - - // Run the MigrationManager code. - future.timed_get(kFutureTimeout); -} - -// Fail a migration if an old v3.2 shard fails to acquire the distributed lock more than once. The -// first LockBusy error identifies the shard as an old shard to the MigrationManager, the second -// indicates the lock is held elsewhere and unavailable. -TEST_F(MigrationManagerTest, SameOldShardFailsToAcquireDistributedLockTwice) { - // Set up a shard in the metadata. - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern)); - - // Set up the database and collection as sharded in the metadata. - std::string dbName = "foo"; - std::string collName = "foo.bar"; - ChunkVersion version(2, 0, OID::gen()); - - setUpDatabase(dbName, kShardId0); - setUpCollection(collName, version); - - // Set up a chunk in the metadata. - ChunkType chunk1 = - setUpChunk(collName, kKeyPattern.globalMin(), kKeyPattern.globalMax(), kShardId0, version); - - // Going to request that this chunk get migrated. - const std::vector<MigrateInfo> migrationRequests{{kShardId1, chunk1}}; - - auto future = launchAsync([this, migrationRequests] { - Client::initThreadIfNotAlready("Test"); - auto txn = cc().makeOperationContext(); - - // Scheduling the moveChunk commands requires finding a host to which to send the command. - // Set up a dummy host for the source shard. - shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0); - - MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance( - txn.get(), migrationRequests, 0, kDefaultSecondaryThrottle, false); - - for (const auto& migrateInfo : migrationRequests) { - ASSERT_EQ(ErrorCodes::LockBusy, migrationStatuses.at(migrateInfo.getName())); - } - }); - - // Expect two sequential moveChunk commands to the same shard, both of which fail with LockBusy. - expectMoveChunkCommand( - chunk1, - kShardId1, - false, - Status(ErrorCodes::LockBusy, "SameCollectionOldShardMigrations generated error.")); - expectMoveChunkCommand( - chunk1, - kShardId1, - true, - Status(ErrorCodes::LockBusy, "SameCollectionOldShardMigrations generated error.")); - - // Run the MigrationManager code. - future.timed_get(kFutureTimeout); -} - -// If in the same collection a migration is scheduled with an old v3.2 shard, a second migration in -// the collection with a different old v3.2 shard should get rescheduled. -TEST_F(MigrationManagerTest, SameCollectionTwoOldShardMigrations) { - // Set up two shards in the metadata. - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern)); - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard2, kMajorityWriteConcern)); - - // Set up the database and collection as sharded in the metadata. - std::string dbName = "foo"; - std::string collName = "foo.bar"; - ChunkVersion version(2, 0, OID::gen()); - - setUpDatabase(dbName, kShardId0); - setUpCollection(collName, version); - - // Set up two chunks in the metadata. - ChunkType chunk1 = - setUpChunk(collName, kKeyPattern.globalMin(), BSON(kPattern << 49), kShardId0, version); - version.incMinor(); - ChunkType chunk2 = - setUpChunk(collName, BSON(kPattern << 49), kKeyPattern.globalMax(), kShardId2, version); - - // Going to request that these two chunks get migrated. - const std::vector<MigrateInfo> migrationRequests{{kShardId1, chunk1}, {kShardId3, chunk2}}; - - auto future = launchAsync([this, migrationRequests] { - Client::initThreadIfNotAlready("Test"); - auto txn = cc().makeOperationContext(); - - // Scheduling the moveChunk commands requires finding a host to which to send the command. - // Set up dummy hosts for the source shards. - shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0); - shardTargeterMock(txn.get(), kShardId2)->setFindHostReturnValue(kShardHost2); - - MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance( - txn.get(), migrationRequests, 0, kDefaultSecondaryThrottle, false); - - for (const auto& migrateInfo : migrationRequests) { - ASSERT_OK(migrationStatuses.at(migrateInfo.getName())); - } - }); - - // Expect two failed moveChunk commands, then two successful moveChunk commands after the - // balancer releases the distributed lock. - expectMoveChunkCommand( - chunk1, - kShardId1, - false, - Status(ErrorCodes::LockBusy, "SameCollectionOldShardMigration generated error.")); - expectMoveChunkCommand( - chunk2, - kShardId3, - false, - Status(ErrorCodes::LockBusy, "SameCollectionOldShardMigration generated error.")); - expectMoveChunkCommand(chunk1, kShardId1, true, Status::OK()); - expectMoveChunkCommand(chunk2, kShardId3, true, Status::OK()); - - // Run the MigrationManager code. - future.timed_get(kFutureTimeout); -} - -// Takes the distributed lock for a collection so that that the MigrationManager is unable to -// schedule migrations for that collection. -TEST_F(MigrationManagerTest, FailToAcquireDistributedLock) { - // Set up two shards in the metadata. - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern)); - ASSERT_OK(catalogClient()->insertConfigDocument( - operationContext(), ShardType::ConfigNS, kShard2, kMajorityWriteConcern)); - - // Set up the database and collection as sharded in the metadata. - std::string dbName = "foo"; - std::string collName = "foo.bar"; - ChunkVersion version(2, 0, OID::gen()); - - setUpDatabase(dbName, kShardId0); - setUpCollection(collName, version); - - // Set up two chunks in the metadata. - ChunkType chunk1 = - setUpChunk(collName, kKeyPattern.globalMin(), BSON(kPattern << 49), kShardId0, version); - version.incMinor(); - ChunkType chunk2 = - setUpChunk(collName, BSON(kPattern << 49), kKeyPattern.globalMax(), kShardId2, version); - - // Going to request that these two chunks get migrated. - const std::vector<MigrateInfo> migrationRequests{{kShardId1, chunk1}, {kShardId3, chunk2}}; - - shardTargeterMock(operationContext(), kShardId0)->setFindHostReturnValue(kShardHost0); - shardTargeterMock(operationContext(), kShardId2)->setFindHostReturnValue(kShardHost2); - - // Take the distributed lock for the collection before scheduling via the MigrationManager. - const std::string whyMessage("FailToAcquireDistributedLock unit-test taking distributed lock"); - DistLockManager::ScopedDistLock distLockStatus = assertGet( - catalogClient()->getDistLockManager()->lock(operationContext(), - chunk1.getNS(), - whyMessage, - DistLockManager::kSingleLockAttemptTimeout)); - - MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance( - operationContext(), migrationRequests, 0, kDefaultSecondaryThrottle, false); - - for (const auto& migrateInfo : migrationRequests) { - ASSERT_EQ(ErrorCodes::ConflictingOperationInProgress, - migrationStatuses.at(migrateInfo.getName())); - } -} - // The MigrationManager should fail the migration if a host is not found for the source shard. // Scheduling a moveChunk command requires finding a host to which to send the command. TEST_F(MigrationManagerTest, SourceShardNotFound) { @@ -674,12 +446,13 @@ TEST_F(MigrationManagerTest, SourceShardNotFound) { }); // Expect only one moveChunk command to be called. - expectMoveChunkCommand(chunk1, kShardId1, false, Status::OK()); + expectMoveChunkCommand(chunk1, kShardId1, Status::OK()); // Run the MigrationManager code. future.timed_get(kFutureTimeout); } +// TODO: Delete in 3.8 TEST_F(MigrationManagerTest, JumboChunkResponseBackwardsCompatibility) { // Set up one shard in the metadata. ASSERT_OK(catalogClient()->insertConfigDocument( @@ -715,7 +488,7 @@ TEST_F(MigrationManagerTest, JumboChunkResponseBackwardsCompatibility) { }); // Expect only one moveChunk command to be called. - expectMoveChunkCommand(chunk1, kShardId1, false, BSON("ok" << 0 << "chunkTooBig" << true)); + expectMoveChunkCommand(chunk1, kShardId1, BSON("ok" << 0 << "chunkTooBig" << true)); // Run the MigrationManager code. future.timed_get(kFutureTimeout); @@ -839,7 +612,7 @@ TEST_F(MigrationManagerTest, RestartMigrationManager) { }); // Expect only one moveChunk command to be called. - expectMoveChunkCommand(chunk1, kShardId1, false, Status::OK()); + expectMoveChunkCommand(chunk1, kShardId1, Status::OK()); // Run the MigrationManager code. future.timed_get(kFutureTimeout); @@ -893,8 +666,8 @@ TEST_F(MigrationManagerTest, MigrationRecovery) { }); // Expect two moveChunk commands. - expectMoveChunkCommand(chunk1, kShardId1, false, Status::OK()); - expectMoveChunkCommand(chunk2, kShardId3, false, Status::OK()); + expectMoveChunkCommand(chunk1, kShardId1, Status::OK()); + expectMoveChunkCommand(chunk2, kShardId3, Status::OK()); // Run the MigrationManager code. future.timed_get(kFutureTimeout); @@ -1005,7 +778,6 @@ TEST_F(MigrationManagerTest, RemoteCallErrorConversionToOperationFailed) { expectMoveChunkCommand( chunk1, kShardId1, - false, Status(ErrorCodes::NotMasterOrSecondary, "RemoteCallErrorConversionToOperationFailedCheck generated error.")); @@ -1013,7 +785,6 @@ TEST_F(MigrationManagerTest, RemoteCallErrorConversionToOperationFailed) { expectMoveChunkCommand( chunk2, kShardId3, - false, Status(ErrorCodes::ExceededTimeLimit, "RemoteCallErrorConversionToOperationFailedCheck generated error.")); |