summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andrew Shuvalov <andrew.shuvalov@mongodb.com> 2021-06-02 21:56:21 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-06-02 22:08:12 +0000
commit: ec0158de27c891599c2d442c0e8c92275b7d23b8 (patch)
tree: af428bab49cae0b5363ad39c81889f47dc38349e
parent: 8b311f78ee7f15cf30f3032edca0ab7403812098 (diff)
download: mongo-ec0158de27c891599c2d442c0e8c92275b7d23b8.tar.gz
SERVER-57356: Better logging around failed chunk migration
-rw-r--r-- src/mongo/db/s/balancer/migration_manager.cpp 88
-rw-r--r-- src/mongo/db/s/balancer/migration_manager.h 16
2 files changed, 65 insertions, 39 deletions
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp
index 070f3f3debb..b37253a3537 100644
--- a/src/mongo/db/s/balancer/migration_manager.cpp
+++ b/src/mongo/db/s/balancer/migration_manager.cpp
@@ -124,7 +124,12 @@ MigrationStatuses MigrationManager::executeMigrationsForAutoBalance(
{
std::map<MigrationIdentifier, ScopedMigrationRequest> scopedMigrationRequests;
- vector<std::pair<shared_ptr<Notification<RemoteCommandResponse>>, MigrateInfo>> responses;
+ struct Response {
+ std::pair<shared_ptr<Notification<RemoteCommandResponse>>, boost::optional<HostAndPort>>
+ response;
+ MigrateInfo migrateInfo;
+ };
+ vector<Response> responses;
for (const auto& migrateInfo : migrateInfos) {
// Write a document to the config.migrations collection, in case this migration must be
@@ -139,22 +144,23 @@ MigrationStatuses MigrationManager::executeMigrationsForAutoBalance(
scopedMigrationRequests.emplace(migrateInfo.getName(),
std::move(statusWithScopedMigrationRequest.getValue()));
- responses.emplace_back(
+ responses.emplace_back(Response{
_schedule(opCtx, migrateInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete),
- migrateInfo);
+ migrateInfo});
}
// Wait for all the scheduled migrations to complete.
for (auto& response : responses) {
- auto notification = std::move(response.first);
- auto migrateInfo = std::move(response.second);
+ auto notification = std::move(response.response.first);
+ auto remoteHost = std::move(response.response.second);
+ auto migrateInfo = std::move(response.migrateInfo);
const auto& remoteCommandResponse = notification->get();
auto it = scopedMigrationRequests.find(migrateInfo.getName());
invariant(it != scopedMigrationRequests.end());
Status commandStatus =
- _processRemoteCommandResponse(remoteCommandResponse, &it->second);
+ _processRemoteCommandResponse(remoteCommandResponse, &it->second, remoteHost);
migrationStatuses.emplace(migrateInfo.getName(), std::move(commandStatus));
}
}
@@ -180,8 +186,10 @@ Status MigrationManager::executeManualMigration(
return statusWithScopedMigrationRequest.getStatus();
}
- RemoteCommandResponse remoteCommandResponse =
- _schedule(opCtx, migrateInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete)->get();
+ auto remoteCommandResponsePair =
+ _schedule(opCtx, migrateInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete);
+ RemoteCommandResponse remoteCommandResponse = remoteCommandResponsePair.first->get();
+ const boost::optional<HostAndPort>& remoteHost = remoteCommandResponsePair.second;
auto routingInfoStatus =
Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(
@@ -196,7 +204,7 @@ Status MigrationManager::executeManualMigration(
routingInfo.cm()->findIntersectingChunkWithSimpleCollation(migrateInfo.minKey);
Status commandStatus = _processRemoteCommandResponse(
- remoteCommandResponse, &statusWithScopedMigrationRequest.getValue());
+ remoteCommandResponse, &statusWithScopedMigrationRequest.getValue(), remoteHost);
// Migration calls can be interrupted after the metadata is committed but before the command
// finishes the waitForDelete stage. Any failovers, therefore, must always cause the moveChunk
@@ -353,8 +361,9 @@ void MigrationManager::finishRecovery(OperationContext* opCtx,
scheduledMigrations++;
- responses.emplace_back(_schedule(
- opCtx, migrationInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete));
+ responses.emplace_back(
+ _schedule(opCtx, migrationInfo, maxChunkSizeBytes, secondaryThrottle, waitForDelete)
+ .first);
}
// If no migrations were scheduled for this namespace, free the dist lock
@@ -413,37 +422,41 @@ void MigrationManager::drainActiveMigrations() {
_state = State::kStopped;
}
-shared_ptr<Notification<RemoteCommandResponse>> MigrationManager::_schedule(
- OperationContext* opCtx,
- const MigrateInfo& migrateInfo,
- uint64_t maxChunkSizeBytes,
- const MigrationSecondaryThrottleOptions& secondaryThrottle,
- bool waitForDelete) {
+std::pair<shared_ptr<Notification<RemoteCommandResponse>>, boost::optional<HostAndPort>>
+MigrationManager::_schedule(OperationContext* opCtx,
+ const MigrateInfo& migrateInfo,
+ uint64_t maxChunkSizeBytes,
+ const MigrationSecondaryThrottleOptions& secondaryThrottle,
+ bool waitForDelete) {
const NamespaceString& nss = migrateInfo.nss;
// Ensure we are not stopped in order to avoid doing the extra work
{
stdx::lock_guard<stdx::mutex> lock(_mutex);
if (_state != State::kEnabled && _state != State::kRecovering) {
- return std::make_shared<Notification<RemoteCommandResponse>>(
- Status(ErrorCodes::BalancerInterrupted,
- "Migration cannot be executed because the balancer is not running"));
+ return std::make_pair(
+ std::make_shared<Notification<RemoteCommandResponse>>(
+ Status(ErrorCodes::BalancerInterrupted,
+ "Migration cannot be executed because the balancer is not running")),
+ boost::none);
}
}
const auto fromShardStatus =
Grid::get(opCtx)->shardRegistry()->getShard(opCtx, migrateInfo.from);
if (!fromShardStatus.isOK()) {
- return std::make_shared<Notification<RemoteCommandResponse>>(
- std::move(fromShardStatus.getStatus()));
+ return std::make_pair(std::make_shared<Notification<RemoteCommandResponse>>(
+ std::move(fromShardStatus.getStatus())),
+ boost::none);
}
const auto fromShard = fromShardStatus.getValue();
auto fromHostStatus = fromShard->getTargeter()->findHost(
opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly});
if (!fromHostStatus.isOK()) {
- return std::make_shared<Notification<RemoteCommandResponse>>(
- std::move(fromHostStatus.getStatus()));
+ return std::make_pair(std::make_shared<Notification<RemoteCommandResponse>>(
+ std::move(fromHostStatus.getStatus())),
+ boost::none);
}
BSONObjBuilder builder;
@@ -462,14 +475,16 @@ shared_ptr<Notification<RemoteCommandResponse>> MigrationManager::_schedule(
stdx::lock_guard<stdx::mutex> lock(_mutex);
if (_state != State::kEnabled && _state != State::kRecovering) {
- return std::make_shared<Notification<RemoteCommandResponse>>(
- Status(ErrorCodes::BalancerInterrupted,
- "Migration cannot be executed because the balancer is not running"));
+ return std::make_pair(
+ std::make_shared<Notification<RemoteCommandResponse>>(
+ Status(ErrorCodes::BalancerInterrupted,
+ "Migration cannot be executed because the balancer is not running")),
+ boost::none);
}
Migration migration(nss, builder.obj());
- auto retVal = migration.completionNotification;
+ auto retVal = std::make_pair(migration.completionNotification, fromHostStatus.getValue());
_schedule(lock, opCtx, fromHostStatus.getValue(), std::move(migration));
@@ -606,10 +621,13 @@ void MigrationManager::_abandonActiveMigrationsAndEnableManager(OperationContext
Status MigrationManager::_processRemoteCommandResponse(
const RemoteCommandResponse& remoteCommandResponse,
- ScopedMigrationRequest* scopedMigrationRequest) {
+ ScopedMigrationRequest* scopedMigrationRequest,
+ const boost::optional<HostAndPort>& remoteHost) {
stdx::lock_guard<stdx::mutex> lock(_mutex);
Status commandStatus(ErrorCodes::InternalError, "Uninitialized value.");
+ auto remoteHostLog =
+ remoteHost ? std::string(" from ") + remoteHost->toString() : std::string();
// Check for local errors sending the remote command caused by stepdown.
if (isErrorDueToConfigStepdown(remoteCommandResponse.status,
@@ -618,7 +636,8 @@ Status MigrationManager::_processRemoteCommandResponse(
return {ErrorCodes::BalancerInterrupted,
stream() << "Migration interrupted because the balancer is stopping."
<< " Command status: "
- << remoteCommandResponse.status.toString()};
+ << remoteCommandResponse.status.toString()
+ << remoteHostLog};
}
if (!remoteCommandResponse.isOK()) {
@@ -631,7 +650,8 @@ Status MigrationManager::_processRemoteCommandResponse(
if (!Shard::shouldErrorBePropagated(commandStatus.code())) {
commandStatus = {ErrorCodes::OperationFailed,
stream() << "moveChunk command failed on source shard."
- << causedBy(commandStatus)};
+ << causedBy(commandStatus)
+ << remoteHostLog};
}
// Any failure to remove the migration document should be because the config server is
@@ -645,9 +665,13 @@ Status MigrationManager::_processRemoteCommandResponse(
stream() << "Migration interrupted because the balancer is stopping"
<< " and failed to remove the config.migrations document."
<< " Command status: "
- << (commandStatus.isOK() ? status.toString() : commandStatus.toString())};
+ << (commandStatus.isOK() ? status.toString() : commandStatus.toString())
+ << remoteHostLog};
}
+ if (!commandStatus.isOK()) {
+ warning() << commandStatus;
+ }
return commandStatus;
}
diff --git a/src/mongo/db/s/balancer/migration_manager.h b/src/mongo/db/s/balancer/migration_manager.h
index bf9a20a8037..979c3943e33 100644
--- a/src/mongo/db/s/balancer/migration_manager.h
+++ b/src/mongo/db/s/balancer/migration_manager.h
@@ -183,12 +183,13 @@ private:
* successful (or not done), schedules the migration request and returns a notification which
* can be used to obtain the outcome of the operation.
*/
- std::shared_ptr<Notification<executor::RemoteCommandResponse>> _schedule(
- OperationContext* opCtx,
- const MigrateInfo& migrateInfo,
- uint64_t maxChunkSizeBytes,
- const MigrationSecondaryThrottleOptions& secondaryThrottle,
- bool waitForDelete);
+ std::pair<std::shared_ptr<Notification<executor::RemoteCommandResponse>>,
+ boost::optional<HostAndPort>>
+ _schedule(OperationContext* opCtx,
+ const MigrateInfo& migrateInfo,
+ uint64_t maxChunkSizeBytes,
+ const MigrationSecondaryThrottleOptions& secondaryThrottle,
+ bool waitForDelete);
/**
* Acquires the collection distributed lock for the specified namespace and if it succeeds,
@@ -246,7 +247,8 @@ private:
*/
Status _processRemoteCommandResponse(
const executor::RemoteCommandResponse& remoteCommandResponse,
- ScopedMigrationRequest* scopedMigrationRequest);
+ ScopedMigrationRequest* scopedMigrationRequest,
+ const boost::optional<HostAndPort>& remoteHost);
// The service context under which this migration manager runs.
ServiceContext* const _serviceContext;