diff options
| | | |
|---|---|---|
author | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2018-01-19 16:24:05 -0500 |
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2018-01-24 13:45:53 -0500 |
commit | bc433b50e0205dfd0a8bfb6906393d841fd8193a (patch) | |
tree | 969334ca08f56dd05254f4aa10b69d74dc080e65 /src/mongo/db | |
parent | fbf03e93dad1d2d081944c05436e777380873eb2 (diff) | |
download | mongo-bc433b50e0205dfd0a8bfb6906393d841fd8193a.tar.gz |
SERVER-28670 Add sharding CatalogCache and donor metrics to serverStatus

Includes metrics for refresh, clone and migration critical section duration.
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/s/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/s/active_migrations_registry.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/s/migration_destination_manager.cpp | 23 | ||||
-rw-r--r-- | src/mongo/db/s/migration_source_manager.cpp | 18 | ||||
-rw-r--r-- | src/mongo/db/s/migration_source_manager.h | 12 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_server_status.cpp | 84 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_state.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.cpp | 62 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.h | 85 |
10 files changed, 250 insertions, 42 deletions
diff --git a/src/mongo/db/s/SConscript b/src/mongo/db/s/SConscript index 04f87ed67e3..c6929a5c1b1 100644 --- a/src/mongo/db/s/SConscript +++ b/src/mongo/db/s/SConscript @@ -86,6 +86,7 @@ env.Library( 'sharding_initialization_mongod.cpp', 'sharding_state.cpp', 'sharding_state_recovery.cpp', + 'sharding_statistics.cpp', 'split_chunk.cpp', 'split_vector.cpp', ], diff --git a/src/mongo/db/s/active_migrations_registry.cpp b/src/mongo/db/s/active_migrations_registry.cpp index 9aadaeb1937..a8006a06e6f 100644 --- a/src/mongo/db/s/active_migrations_registry.cpp +++ b/src/mongo/db/s/active_migrations_registry.cpp @@ -109,7 +109,7 @@ BSONObj ActiveMigrationsRegistry::getActiveMigrationStatusReport(OperationContex AutoGetCollection autoColl(opCtx, nss.get(), MODE_IS); auto css = CollectionShardingState::get(opCtx, nss.get()); - if (css && css->getMigrationSourceManager()) { + if (css->getMigrationSourceManager()) { return css->getMigrationSourceManager()->getMigrationStatusReport(); } } diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp b/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp index 2b36c3efa30..a22d83f15f5 100644 --- a/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp +++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp @@ -75,7 +75,7 @@ public: auto css = CollectionShardingState::get(opCtx, *nss); uassert(ErrorCodes::IllegalOperation, str::stream() << "No active migrations were found for collection " << nss->ns(), - css && css->getMigrationSourceManager()); + css->getMigrationSourceManager()); // It is now safe to access the cloner _chunkCloner = dynamic_cast<MigrationChunkClonerSourceLegacy*>( diff --git a/src/mongo/db/s/migration_destination_manager.cpp b/src/mongo/db/s/migration_destination_manager.cpp index 50a038e26c4..a575b0a3018 100644 --- a/src/mongo/db/s/migration_destination_manager.cpp +++ b/src/mongo/db/s/migration_destination_manager.cpp @@ -64,10 +64,6 @@ #include 
"mongo/util/mongoutils/str.h" namespace mongo { - -using std::string; -using str::stream; - namespace { const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, @@ -82,7 +78,7 @@ const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, /** * Returns a human-readabale name of the migration manager's state. */ -string stateToString(MigrationDestinationManager::State state) { +std::string stateToString(MigrationDestinationManager::State state) { switch (state) { case MigrationDestinationManager::READY: return "ready"; @@ -350,7 +346,6 @@ Status MigrationDestinationManager::start(const NamespaceString& nss, } Status MigrationDestinationManager::abort(const MigrationSessionId& sessionId) { - stdx::lock_guard<stdx::mutex> sl(_mutex); if (!_sessionId) { @@ -733,11 +728,10 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* opCtx, cx.db(), docToClone, &localDoc)) { - string errMsg = str::stream() << "cannot migrate chunk, local document " - << redact(localDoc) - << " has same _id as cloned " - << "remote document " << redact(docToClone); - + const std::string errMsg = str::stream() + << "cannot migrate chunk, local document " << redact(localDoc) + << " has same _id as cloned " + << "remote document " << redact(docToClone); warning() << errMsg; // Exception will abort migration cleanly @@ -1008,10 +1002,9 @@ bool MigrationDestinationManager::_applyMigrateOp(OperationContext* opCtx, BSONObj localDoc; if (willOverrideLocalId( opCtx, nss, min, max, shardKeyPattern, cx.db(), updatedDoc, &localDoc)) { - string errMsg = str::stream() << "cannot migrate chunk, local document " << localDoc - << " has same _id as reloaded remote document " - << updatedDoc; - + const std::string errMsg = str::stream() + << "cannot migrate chunk, local document " << localDoc + << " has same _id as reloaded remote document " << updatedDoc; warning() << errMsg; // Exception will abort migration cleanly diff --git 
a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp index c2cbdf5b807..f50e23bb550 100644 --- a/src/mongo/db/s/migration_source_manager.cpp +++ b/src/mongo/db/s/migration_source_manager.cpp @@ -41,6 +41,7 @@ #include "mongo/db/s/shard_metadata_util.h" #include "mongo/db/s/sharding_state.h" #include "mongo/db/s/sharding_state_recovery.h" +#include "mongo/db/s/sharding_statistics.h" #include "mongo/executor/task_executor.h" #include "mongo/executor/task_executor_pool.h" #include "mongo/s/catalog/sharding_catalog_client.h" @@ -133,7 +134,8 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, HostAndPort recipientHost) : _args(std::move(request)), _donorConnStr(std::move(donorConnStr)), - _recipientHost(std::move(recipientHost)) { + _recipientHost(std::move(recipientHost)), + _stats(ShardingStatistics::get(opCtx)) { invariant(!opCtx->lockState()->isLocked()); // Disallow moving a chunk to ourselves @@ -208,6 +210,7 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx, MigrationSourceManager::~MigrationSourceManager() { invariant(!_cloneDriver); + _stats.totalDonorMoveChunkTimeMillis.addAndFetch(_entireOpTimer.millis()); } NamespaceString MigrationSourceManager::getNss() const { @@ -218,6 +221,7 @@ Status MigrationSourceManager::startClone(OperationContext* opCtx) { invariant(!opCtx->lockState()->isLocked()); invariant(_state == kCreated); auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); }); + _stats.countDonorMoveChunkStarted.addAndFetch(1); Grid::get(opCtx) ->catalogClient() @@ -231,6 +235,8 @@ Status MigrationSourceManager::startClone(OperationContext* opCtx) { ShardingCatalogClient::kMajorityWriteConcern) .ignore(); + _cloneAndCommitTimer.reset(); + { // Register for notifications from the replication subsystem AutoGetCollection autoColl(opCtx, getNss(), MODE_IX, MODE_X); @@ -265,6 +271,8 @@ Status MigrationSourceManager::awaitToCatchUp(OperationContext* opCtx) { 
invariant(!opCtx->lockState()->isLocked()); invariant(_state == kCloning); auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); }); + _stats.totalDonorChunkCloneTimeMillis.addAndFetch(_cloneAndCommitTimer.millis()); + _cloneAndCommitTimer.reset(); // Block until the cloner deems it appropriate to enter the critical section. Status catchUpStatus = _cloneDriver->awaitUntilCriticalSectionIsAppropriate( @@ -282,6 +290,8 @@ Status MigrationSourceManager::enterCriticalSection(OperationContext* opCtx) { invariant(!opCtx->lockState()->isLocked()); invariant(_state == kCloneCaughtUp); auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); }); + _stats.totalDonorChunkCloneTimeMillis.addAndFetch(_cloneAndCommitTimer.millis()); + _cloneAndCommitTimer.reset(); { const auto metadata = [&] { @@ -417,6 +427,8 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig(OperationContext* opC _readsShouldWaitOnCritSec = true; } + Timer t; + auto commitChunkMigrationResponse = Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts( opCtx, @@ -527,6 +539,8 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig(OperationContext* opC scopedGuard.Dismiss(); + _stats.totalCriticalSectionCommitTimeMillis.addAndFetch(t.millis()); + // Exit the critical section and ensure that all the necessary state is fully persisted before // scheduling orphan cleanup. 
_cleanup(opCtx); @@ -659,6 +673,8 @@ void MigrationSourceManager::_cleanup(OperationContext* opCtx) { } if (_state == kCriticalSection || _state == kCloneCompleted) { + _stats.totalCriticalSectionTimeMillis.addAndFetch(_cloneAndCommitTimer.millis()); + // NOTE: The order of the operations below is important and the comments explain the // reasoning behind it diff --git a/src/mongo/db/s/migration_source_manager.h b/src/mongo/db/s/migration_source_manager.h index d6a6df981c0..635fed591f7 100644 --- a/src/mongo/db/s/migration_source_manager.h +++ b/src/mongo/db/s/migration_source_manager.h @@ -40,6 +40,7 @@ namespace mongo { class MigrationChunkClonerSource; class OperationContext; +struct ShardingStatistics; /** * The donor-side migration state machine. This object must be created and owned by a single thread, @@ -211,8 +212,15 @@ private: // The resolved primary of the recipient shard const HostAndPort _recipientHost; - // Gets initialized at creation time and will time the entire move chunk operation - const Timer _startTime; + // Stores a reference to the process sharding statistics object which needs to be updated + ShardingStatistics& _stats; + + // Times the entire moveChunk operation + const Timer _entireOpTimer; + + // Starts counting from creation time and is used to time various parts from the lifetime of the + // move chunk sequence + Timer _cloneAndCommitTimer; // The current state. Used only for diagnostics and validation. 
State _state{kCreated}; diff --git a/src/mongo/db/s/sharding_server_status.cpp b/src/mongo/db/s/sharding_server_status.cpp index 248a4669587..2b5e7600f5b 100644 --- a/src/mongo/db/s/sharding_server_status.cpp +++ b/src/mongo/db/s/sharding_server_status.cpp @@ -31,45 +31,57 @@ #include "mongo/bson/bsonobjbuilder.h" #include "mongo/db/commands/server_status.h" #include "mongo/db/s/sharding_state.h" +#include "mongo/db/s/sharding_statistics.h" #include "mongo/db/server_options.h" #include "mongo/s/balancer_configuration.h" +#include "mongo/s/catalog_cache.h" +#include "mongo/s/client/shard_registry.h" #include "mongo/s/grid.h" namespace mongo { namespace { -class ShardingServerStatus : public ServerStatusSection { +bool isClusterNode() { + return serverGlobalParams.clusterRole != ClusterRole::None; +} + +class ShardingServerStatus final : public ServerStatusSection { public: ShardingServerStatus() : ServerStatusSection("sharding") {} - bool includeByDefault() const final { - return true; + bool includeByDefault() const override { + return isClusterNode(); } - BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const final { - BSONObjBuilder result; + BSONObj generateSection(OperationContext* opCtx, + const BSONElement& configElement) const override { + if (!isClusterNode()) + return {}; + + auto const shardingState = ShardingState::get(opCtx); + if (!shardingState->enabled()) + return {}; - auto shardingState = ShardingState::get(opCtx); - if (shardingState->enabled() && - serverGlobalParams.clusterRole != ClusterRole::ConfigServer) { + auto const grid = Grid::get(opCtx); + auto const shardRegistry = grid->shardRegistry(); + + BSONObjBuilder result; - result.append("configsvrConnectionString", - shardingState->getConfigServer(opCtx).toString()); + result.append("configsvrConnectionString", + shardRegistry->getConfigServerConnectionString().toString()); - Grid::get(opCtx)->configOpTime().append(&result, "lastSeenConfigServerOpTime"); + 
grid->configOpTime().append(&result, "lastSeenConfigServerOpTime"); - long long maxChunkSizeInBytes = - Grid::get(opCtx)->getBalancerConfiguration()->getMaxChunkSizeBytes(); - result.append("maxChunkSizeInBytes", maxChunkSizeInBytes); + const long long maxChunkSizeInBytes = + grid->getBalancerConfiguration()->getMaxChunkSizeBytes(); + result.append("maxChunkSizeInBytes", maxChunkSizeInBytes); - // Get a migration status report if a migration is active for which this is the source - // shard. ShardingState::getActiveMigrationStatusReport will take an IS lock on the - // namespace of the active migration if there is one that is active. - BSONObj migrationStatus = - ShardingState::get(opCtx)->getActiveMigrationStatusReport(opCtx); - if (!migrationStatus.isEmpty()) { - result.append("migrations", migrationStatus); - } + // Get a migration status report if a migration is active for which this is the source + // shard. ShardingState::getActiveMigrationStatusReport will take an IS lock on the + // namespace of the active migration if there is one that is active. 
+ BSONObj migrationStatus = shardingState->getActiveMigrationStatusReport(opCtx); + if (!migrationStatus.isEmpty()) { + result.append("migrations", migrationStatus); } return result.obj(); @@ -77,5 +89,33 @@ public: } shardingServerStatus; +class ShardingStatisticsServerStatus final : public ServerStatusSection { +public: + ShardingStatisticsServerStatus() : ServerStatusSection("shardingStatistics") {} + + bool includeByDefault() const override { + return isClusterNode(); + } + + BSONObj generateSection(OperationContext* opCtx, + const BSONElement& configElement) const override { + if (!isClusterNode()) + return {}; + + auto const shardingState = ShardingState::get(opCtx); + if (!shardingState->enabled()) + return {}; + + auto const grid = Grid::get(opCtx); + auto const catalogCache = grid->catalogCache(); + + BSONObjBuilder result; + ShardingStatistics::get(opCtx).report(&result); + catalogCache->report(&result); + return result.obj(); + } + +} shardingStatisticsServerStatus; + } // namespace } // namespace mongo diff --git a/src/mongo/db/s/sharding_state.cpp b/src/mongo/db/s/sharding_state.cpp index 81a60c2be60..77298091f12 100644 --- a/src/mongo/db/s/sharding_state.cpp +++ b/src/mongo/db/s/sharding_state.cpp @@ -49,6 +49,7 @@ #include "mongo/db/s/operation_sharding_state.h" #include "mongo/db/s/sharded_connection_info.h" #include "mongo/db/s/sharding_initialization_mongod.h" +#include "mongo/db/s/sharding_statistics.h" #include "mongo/db/s/type_shard_identity.h" #include "mongo/executor/network_interface_factory.h" #include "mongo/executor/network_interface_thread_pool.h" @@ -225,6 +226,8 @@ Status ShardingState::onStaleShardVersion(OperationContext* opCtx, LOG(2) << "metadata refresh requested for " << nss.ns() << " at shard version " << expectedVersion; + ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1); + // Ensure any ongoing migrations have completed auto& oss = OperationShardingState::get(opCtx); 
oss.waitForMigrationCriticalSectionSignal(opCtx); diff --git a/src/mongo/db/s/sharding_statistics.cpp b/src/mongo/db/s/sharding_statistics.cpp new file mode 100644 index 00000000000..905f1869bf9 --- /dev/null +++ b/src/mongo/db/s/sharding_statistics.cpp @@ -0,0 +1,62 @@ +/** + * Copyright (C) 2018 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. 
+ */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/s/sharding_statistics.h" + +#include "mongo/bson/bsonobjbuilder.h" +#include "mongo/db/operation_context.h" +#include "mongo/db/service_context.h" + +namespace mongo { +namespace { + +const auto getShardingStatistics = ServiceContext::declareDecoration<ShardingStatistics>(); + +} // namespace + +ShardingStatistics& ShardingStatistics::get(ServiceContext* serviceContext) { + return getShardingStatistics(serviceContext); +} + +ShardingStatistics& ShardingStatistics::get(OperationContext* opCtx) { + return get(opCtx->getServiceContext()); +} + +void ShardingStatistics::report(BSONObjBuilder* builder) const { + builder->append("countStaleConfigErrors", countStaleConfigErrors.load()); + + builder->append("countDonorMoveChunkStarted", countDonorMoveChunkStarted.load()); + builder->append("totalDonorChunkCloneTimeMillis", totalDonorChunkCloneTimeMillis.load()); + builder->append("totalCriticalSectionCommitTimeMillis", + totalCriticalSectionCommitTimeMillis.load()); + builder->append("totalCriticalSectionTimeMillis", totalCriticalSectionTimeMillis.load()); +} + +} // namespace mongo diff --git a/src/mongo/db/s/sharding_statistics.h b/src/mongo/db/s/sharding_statistics.h new file mode 100644 index 00000000000..c4eb7abd728 --- /dev/null +++ b/src/mongo/db/s/sharding_statistics.h @@ -0,0 +1,85 @@ +/** + * Copyright (C) 2015 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#pragma once + +#include "mongo/platform/atomic_word.h" + +namespace mongo { + +class BSONObjBuilder; +class OperationContext; +class ServiceContext; + +/** + * Encapsulates per-process statistics for the sharding subsystem. 
+ */ +struct ShardingStatistics { + // Counts how many times threads hit stale config exception (which is what triggers metadata + // refreshes) + AtomicInt64 countStaleConfigErrors{0}; + + // Cumulative, always-increasing counter of how many chunks did this node start donating + // (whether they succeeded or not) + AtomicInt64 countDonorMoveChunkStarted{0}; + + // Cumulative, always-increasing counter of how much time the entire move chunk operation took + // (excluding range deletion) + AtomicInt64 totalDonorMoveChunkTimeMillis{0}; + + // Cumulative, always-increasing counter of how much time the clone phase took on the donor + // node, before it was appropriate to enter the critical section + AtomicInt64 totalDonorChunkCloneTimeMillis{0}; + + // Cumulative, always-increasing counter of how much time the critical section's commit phase + // took (this is the period of time when all operations on the collection are blocked, not just + // the reads) + AtomicInt64 totalCriticalSectionCommitTimeMillis{0}; + + // Cumulative, always-increasing counter of how much time the entire critical section took. It + // includes the time the recipient took to fetch the latest modifications from the donor and + // persist them plus the critical section commit time. + // + // The value of totalCriticalSectionTimeMillis - totalCriticalSectionCommitTimeMillis gives the + // duration of the catch-up phase of the critical section (where the last mods are transferred + // from the donor to the recipient). + AtomicInt64 totalCriticalSectionTimeMillis{0}; + + /** + * Obtains the per-process instance of the sharding statistics object. + */ + static ShardingStatistics& get(ServiceContext* serviceContext); + static ShardingStatistics& get(OperationContext* opCtx); + + /** + * Reports the accumulated statistics for serverStatus. + */ + void report(BSONObjBuilder* builder) const; +}; + +} // namespace mongo |