summaryrefslogtreecommitdiff
path: root/src/mongo/db
diff options
context:
space:
mode:
authorKaloian Manassiev <kaloian.manassiev@mongodb.com>2018-01-19 16:24:05 -0500
committerKaloian Manassiev <kaloian.manassiev@mongodb.com>2018-01-24 13:45:53 -0500
commitbc433b50e0205dfd0a8bfb6906393d841fd8193a (patch)
tree969334ca08f56dd05254f4aa10b69d74dc080e65 /src/mongo/db
parentfbf03e93dad1d2d081944c05436e777380873eb2 (diff)
downloadmongo-bc433b50e0205dfd0a8bfb6906393d841fd8193a.tar.gz
SERVER-28670 Add sharding CatalogCache and donor metrics to serverStatus
Includes metrics for refresh, clone and migration critical section duration.
Diffstat (limited to 'src/mongo/db')
-rw-r--r--src/mongo/db/s/SConscript1
-rw-r--r--src/mongo/db/s/active_migrations_registry.cpp2
-rw-r--r--src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp2
-rw-r--r--src/mongo/db/s/migration_destination_manager.cpp23
-rw-r--r--src/mongo/db/s/migration_source_manager.cpp18
-rw-r--r--src/mongo/db/s/migration_source_manager.h12
-rw-r--r--src/mongo/db/s/sharding_server_status.cpp84
-rw-r--r--src/mongo/db/s/sharding_state.cpp3
-rw-r--r--src/mongo/db/s/sharding_statistics.cpp62
-rw-r--r--src/mongo/db/s/sharding_statistics.h85
10 files changed, 250 insertions, 42 deletions
diff --git a/src/mongo/db/s/SConscript b/src/mongo/db/s/SConscript
index 04f87ed67e3..c6929a5c1b1 100644
--- a/src/mongo/db/s/SConscript
+++ b/src/mongo/db/s/SConscript
@@ -86,6 +86,7 @@ env.Library(
'sharding_initialization_mongod.cpp',
'sharding_state.cpp',
'sharding_state_recovery.cpp',
+ 'sharding_statistics.cpp',
'split_chunk.cpp',
'split_vector.cpp',
],
diff --git a/src/mongo/db/s/active_migrations_registry.cpp b/src/mongo/db/s/active_migrations_registry.cpp
index 9aadaeb1937..a8006a06e6f 100644
--- a/src/mongo/db/s/active_migrations_registry.cpp
+++ b/src/mongo/db/s/active_migrations_registry.cpp
@@ -109,7 +109,7 @@ BSONObj ActiveMigrationsRegistry::getActiveMigrationStatusReport(OperationContex
AutoGetCollection autoColl(opCtx, nss.get(), MODE_IS);
auto css = CollectionShardingState::get(opCtx, nss.get());
- if (css && css->getMigrationSourceManager()) {
+ if (css->getMigrationSourceManager()) {
return css->getMigrationSourceManager()->getMigrationStatusReport();
}
}
diff --git a/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp b/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp
index 2b36c3efa30..a22d83f15f5 100644
--- a/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp
+++ b/src/mongo/db/s/migration_chunk_cloner_source_legacy_commands.cpp
@@ -75,7 +75,7 @@ public:
auto css = CollectionShardingState::get(opCtx, *nss);
uassert(ErrorCodes::IllegalOperation,
str::stream() << "No active migrations were found for collection " << nss->ns(),
- css && css->getMigrationSourceManager());
+ css->getMigrationSourceManager());
// It is now safe to access the cloner
_chunkCloner = dynamic_cast<MigrationChunkClonerSourceLegacy*>(
diff --git a/src/mongo/db/s/migration_destination_manager.cpp b/src/mongo/db/s/migration_destination_manager.cpp
index 50a038e26c4..a575b0a3018 100644
--- a/src/mongo/db/s/migration_destination_manager.cpp
+++ b/src/mongo/db/s/migration_destination_manager.cpp
@@ -64,10 +64,6 @@
#include "mongo/util/mongoutils/str.h"
namespace mongo {
-
-using std::string;
-using str::stream;
-
namespace {
const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority,
@@ -82,7 +78,7 @@ const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority,
/**
 * Returns a human-readable name of the migration manager's state.
*/
-string stateToString(MigrationDestinationManager::State state) {
+std::string stateToString(MigrationDestinationManager::State state) {
switch (state) {
case MigrationDestinationManager::READY:
return "ready";
@@ -350,7 +346,6 @@ Status MigrationDestinationManager::start(const NamespaceString& nss,
}
Status MigrationDestinationManager::abort(const MigrationSessionId& sessionId) {
-
stdx::lock_guard<stdx::mutex> sl(_mutex);
if (!_sessionId) {
@@ -733,11 +728,10 @@ void MigrationDestinationManager::_migrateDriver(OperationContext* opCtx,
cx.db(),
docToClone,
&localDoc)) {
- string errMsg = str::stream() << "cannot migrate chunk, local document "
- << redact(localDoc)
- << " has same _id as cloned "
- << "remote document " << redact(docToClone);
-
+ const std::string errMsg = str::stream()
+ << "cannot migrate chunk, local document " << redact(localDoc)
+ << " has same _id as cloned "
+ << "remote document " << redact(docToClone);
warning() << errMsg;
// Exception will abort migration cleanly
@@ -1008,10 +1002,9 @@ bool MigrationDestinationManager::_applyMigrateOp(OperationContext* opCtx,
BSONObj localDoc;
if (willOverrideLocalId(
opCtx, nss, min, max, shardKeyPattern, cx.db(), updatedDoc, &localDoc)) {
- string errMsg = str::stream() << "cannot migrate chunk, local document " << localDoc
- << " has same _id as reloaded remote document "
- << updatedDoc;
-
+ const std::string errMsg = str::stream()
+ << "cannot migrate chunk, local document " << localDoc
+ << " has same _id as reloaded remote document " << updatedDoc;
warning() << errMsg;
// Exception will abort migration cleanly
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index c2cbdf5b807..f50e23bb550 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -41,6 +41,7 @@
#include "mongo/db/s/shard_metadata_util.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/db/s/sharding_state_recovery.h"
+#include "mongo/db/s/sharding_statistics.h"
#include "mongo/executor/task_executor.h"
#include "mongo/executor/task_executor_pool.h"
#include "mongo/s/catalog/sharding_catalog_client.h"
@@ -133,7 +134,8 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
HostAndPort recipientHost)
: _args(std::move(request)),
_donorConnStr(std::move(donorConnStr)),
- _recipientHost(std::move(recipientHost)) {
+ _recipientHost(std::move(recipientHost)),
+ _stats(ShardingStatistics::get(opCtx)) {
invariant(!opCtx->lockState()->isLocked());
// Disallow moving a chunk to ourselves
@@ -208,6 +210,7 @@ MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
MigrationSourceManager::~MigrationSourceManager() {
invariant(!_cloneDriver);
+ _stats.totalDonorMoveChunkTimeMillis.addAndFetch(_entireOpTimer.millis());
}
NamespaceString MigrationSourceManager::getNss() const {
@@ -218,6 +221,7 @@ Status MigrationSourceManager::startClone(OperationContext* opCtx) {
invariant(!opCtx->lockState()->isLocked());
invariant(_state == kCreated);
auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); });
+ _stats.countDonorMoveChunkStarted.addAndFetch(1);
Grid::get(opCtx)
->catalogClient()
@@ -231,6 +235,8 @@ Status MigrationSourceManager::startClone(OperationContext* opCtx) {
ShardingCatalogClient::kMajorityWriteConcern)
.ignore();
+ _cloneAndCommitTimer.reset();
+
{
// Register for notifications from the replication subsystem
AutoGetCollection autoColl(opCtx, getNss(), MODE_IX, MODE_X);
@@ -265,6 +271,8 @@ Status MigrationSourceManager::awaitToCatchUp(OperationContext* opCtx) {
invariant(!opCtx->lockState()->isLocked());
invariant(_state == kCloning);
auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); });
+ _stats.totalDonorChunkCloneTimeMillis.addAndFetch(_cloneAndCommitTimer.millis());
+ _cloneAndCommitTimer.reset();
// Block until the cloner deems it appropriate to enter the critical section.
Status catchUpStatus = _cloneDriver->awaitUntilCriticalSectionIsAppropriate(
@@ -282,6 +290,8 @@ Status MigrationSourceManager::enterCriticalSection(OperationContext* opCtx) {
invariant(!opCtx->lockState()->isLocked());
invariant(_state == kCloneCaughtUp);
auto scopedGuard = MakeGuard([&] { cleanupOnError(opCtx); });
+ _stats.totalDonorChunkCloneTimeMillis.addAndFetch(_cloneAndCommitTimer.millis());
+ _cloneAndCommitTimer.reset();
{
const auto metadata = [&] {
@@ -417,6 +427,8 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig(OperationContext* opC
_readsShouldWaitOnCritSec = true;
}
+ Timer t;
+
auto commitChunkMigrationResponse =
Grid::get(opCtx)->shardRegistry()->getConfigShard()->runCommandWithFixedRetryAttempts(
opCtx,
@@ -527,6 +539,8 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig(OperationContext* opC
scopedGuard.Dismiss();
+ _stats.totalCriticalSectionCommitTimeMillis.addAndFetch(t.millis());
+
// Exit the critical section and ensure that all the necessary state is fully persisted before
// scheduling orphan cleanup.
_cleanup(opCtx);
@@ -659,6 +673,8 @@ void MigrationSourceManager::_cleanup(OperationContext* opCtx) {
}
if (_state == kCriticalSection || _state == kCloneCompleted) {
+ _stats.totalCriticalSectionTimeMillis.addAndFetch(_cloneAndCommitTimer.millis());
+
// NOTE: The order of the operations below is important and the comments explain the
// reasoning behind it
diff --git a/src/mongo/db/s/migration_source_manager.h b/src/mongo/db/s/migration_source_manager.h
index d6a6df981c0..635fed591f7 100644
--- a/src/mongo/db/s/migration_source_manager.h
+++ b/src/mongo/db/s/migration_source_manager.h
@@ -40,6 +40,7 @@ namespace mongo {
class MigrationChunkClonerSource;
class OperationContext;
+struct ShardingStatistics;
/**
* The donor-side migration state machine. This object must be created and owned by a single thread,
@@ -211,8 +212,15 @@ private:
// The resolved primary of the recipient shard
const HostAndPort _recipientHost;
- // Gets initialized at creation time and will time the entire move chunk operation
- const Timer _startTime;
+ // Stores a reference to the process sharding statistics object which needs to be updated
+ ShardingStatistics& _stats;
+
+ // Times the entire moveChunk operation
+ const Timer _entireOpTimer;
+
+ // Starts counting from creation time and is used to time various parts from the lifetime of the
+ // move chunk sequence
+ Timer _cloneAndCommitTimer;
// The current state. Used only for diagnostics and validation.
State _state{kCreated};
diff --git a/src/mongo/db/s/sharding_server_status.cpp b/src/mongo/db/s/sharding_server_status.cpp
index 248a4669587..2b5e7600f5b 100644
--- a/src/mongo/db/s/sharding_server_status.cpp
+++ b/src/mongo/db/s/sharding_server_status.cpp
@@ -31,45 +31,57 @@
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/db/commands/server_status.h"
#include "mongo/db/s/sharding_state.h"
+#include "mongo/db/s/sharding_statistics.h"
#include "mongo/db/server_options.h"
#include "mongo/s/balancer_configuration.h"
+#include "mongo/s/catalog_cache.h"
+#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
namespace mongo {
namespace {
-class ShardingServerStatus : public ServerStatusSection {
+bool isClusterNode() {
+ return serverGlobalParams.clusterRole != ClusterRole::None;
+}
+
+class ShardingServerStatus final : public ServerStatusSection {
public:
ShardingServerStatus() : ServerStatusSection("sharding") {}
- bool includeByDefault() const final {
- return true;
+ bool includeByDefault() const override {
+ return isClusterNode();
}
- BSONObj generateSection(OperationContext* opCtx, const BSONElement& configElement) const final {
- BSONObjBuilder result;
+ BSONObj generateSection(OperationContext* opCtx,
+ const BSONElement& configElement) const override {
+ if (!isClusterNode())
+ return {};
+
+ auto const shardingState = ShardingState::get(opCtx);
+ if (!shardingState->enabled())
+ return {};
- auto shardingState = ShardingState::get(opCtx);
- if (shardingState->enabled() &&
- serverGlobalParams.clusterRole != ClusterRole::ConfigServer) {
+ auto const grid = Grid::get(opCtx);
+ auto const shardRegistry = grid->shardRegistry();
+
+ BSONObjBuilder result;
- result.append("configsvrConnectionString",
- shardingState->getConfigServer(opCtx).toString());
+ result.append("configsvrConnectionString",
+ shardRegistry->getConfigServerConnectionString().toString());
- Grid::get(opCtx)->configOpTime().append(&result, "lastSeenConfigServerOpTime");
+ grid->configOpTime().append(&result, "lastSeenConfigServerOpTime");
- long long maxChunkSizeInBytes =
- Grid::get(opCtx)->getBalancerConfiguration()->getMaxChunkSizeBytes();
- result.append("maxChunkSizeInBytes", maxChunkSizeInBytes);
+ const long long maxChunkSizeInBytes =
+ grid->getBalancerConfiguration()->getMaxChunkSizeBytes();
+ result.append("maxChunkSizeInBytes", maxChunkSizeInBytes);
- // Get a migration status report if a migration is active for which this is the source
- // shard. ShardingState::getActiveMigrationStatusReport will take an IS lock on the
- // namespace of the active migration if there is one that is active.
- BSONObj migrationStatus =
- ShardingState::get(opCtx)->getActiveMigrationStatusReport(opCtx);
- if (!migrationStatus.isEmpty()) {
- result.append("migrations", migrationStatus);
- }
+ // Get a migration status report if a migration is active for which this is the source
+ // shard. ShardingState::getActiveMigrationStatusReport will take an IS lock on the
+ // namespace of the active migration if there is one that is active.
+ BSONObj migrationStatus = shardingState->getActiveMigrationStatusReport(opCtx);
+ if (!migrationStatus.isEmpty()) {
+ result.append("migrations", migrationStatus);
}
return result.obj();
@@ -77,5 +89,33 @@ public:
} shardingServerStatus;
+class ShardingStatisticsServerStatus final : public ServerStatusSection {
+public:
+ ShardingStatisticsServerStatus() : ServerStatusSection("shardingStatistics") {}
+
+ bool includeByDefault() const override {
+ return isClusterNode();
+ }
+
+ BSONObj generateSection(OperationContext* opCtx,
+ const BSONElement& configElement) const override {
+ if (!isClusterNode())
+ return {};
+
+ auto const shardingState = ShardingState::get(opCtx);
+ if (!shardingState->enabled())
+ return {};
+
+ auto const grid = Grid::get(opCtx);
+ auto const catalogCache = grid->catalogCache();
+
+ BSONObjBuilder result;
+ ShardingStatistics::get(opCtx).report(&result);
+ catalogCache->report(&result);
+ return result.obj();
+ }
+
+} shardingStatisticsServerStatus;
+
} // namespace
} // namespace mongo
diff --git a/src/mongo/db/s/sharding_state.cpp b/src/mongo/db/s/sharding_state.cpp
index 81a60c2be60..77298091f12 100644
--- a/src/mongo/db/s/sharding_state.cpp
+++ b/src/mongo/db/s/sharding_state.cpp
@@ -49,6 +49,7 @@
#include "mongo/db/s/operation_sharding_state.h"
#include "mongo/db/s/sharded_connection_info.h"
#include "mongo/db/s/sharding_initialization_mongod.h"
+#include "mongo/db/s/sharding_statistics.h"
#include "mongo/db/s/type_shard_identity.h"
#include "mongo/executor/network_interface_factory.h"
#include "mongo/executor/network_interface_thread_pool.h"
@@ -225,6 +226,8 @@ Status ShardingState::onStaleShardVersion(OperationContext* opCtx,
LOG(2) << "metadata refresh requested for " << nss.ns() << " at shard version "
<< expectedVersion;
+ ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1);
+
// Ensure any ongoing migrations have completed
auto& oss = OperationShardingState::get(opCtx);
oss.waitForMigrationCriticalSectionSignal(opCtx);
diff --git a/src/mongo/db/s/sharding_statistics.cpp b/src/mongo/db/s/sharding_statistics.cpp
new file mode 100644
index 00000000000..905f1869bf9
--- /dev/null
+++ b/src/mongo/db/s/sharding_statistics.cpp
@@ -0,0 +1,62 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/s/sharding_statistics.h"
+
+#include "mongo/bson/bsonobjbuilder.h"
+#include "mongo/db/operation_context.h"
+#include "mongo/db/service_context.h"
+
+namespace mongo {
+namespace {
+
+const auto getShardingStatistics = ServiceContext::declareDecoration<ShardingStatistics>();
+
+} // namespace
+
+ShardingStatistics& ShardingStatistics::get(ServiceContext* serviceContext) {
+ return getShardingStatistics(serviceContext);
+}
+
+ShardingStatistics& ShardingStatistics::get(OperationContext* opCtx) {
+ return get(opCtx->getServiceContext());
+}
+
+void ShardingStatistics::report(BSONObjBuilder* builder) const {
+ builder->append("countStaleConfigErrors", countStaleConfigErrors.load());
+
+ builder->append("countDonorMoveChunkStarted", countDonorMoveChunkStarted.load());
+ builder->append("totalDonorChunkCloneTimeMillis", totalDonorChunkCloneTimeMillis.load());
+ builder->append("totalCriticalSectionCommitTimeMillis",
+ totalCriticalSectionCommitTimeMillis.load());
+ builder->append("totalCriticalSectionTimeMillis", totalCriticalSectionTimeMillis.load());
+}
+
+} // namespace mongo
diff --git a/src/mongo/db/s/sharding_statistics.h b/src/mongo/db/s/sharding_statistics.h
new file mode 100644
index 00000000000..c4eb7abd728
--- /dev/null
+++ b/src/mongo/db/s/sharding_statistics.h
@@ -0,0 +1,85 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/platform/atomic_word.h"
+
+namespace mongo {
+
+class BSONObjBuilder;
+class OperationContext;
+class ServiceContext;
+
+/**
+ * Encapsulates per-process statistics for the sharding subsystem.
+ */
+struct ShardingStatistics {
+ // Counts how many times threads hit stale config exception (which is what triggers metadata
+ // refreshes)
+ AtomicInt64 countStaleConfigErrors{0};
+
+ // Cumulative, always-increasing counter of how many chunks did this node start donating
+ // (whether they succeeded or not)
+ AtomicInt64 countDonorMoveChunkStarted{0};
+
+ // Cumulative, always-increasing counter of how much time the entire move chunk operation took
+ // (excluding range deletion)
+ AtomicInt64 totalDonorMoveChunkTimeMillis{0};
+
+ // Cumulative, always-increasing counter of how much time the clone phase took on the donor
+ // node, before it was appropriate to enter the critical section
+ AtomicInt64 totalDonorChunkCloneTimeMillis{0};
+
+ // Cumulative, always-increasing counter of how much time the critical section's commit phase
+ // took (this is the period of time when all operations on the collection are blocked, not just
+ // the reads)
+ AtomicInt64 totalCriticalSectionCommitTimeMillis{0};
+
+ // Cumulative, always-increasing counter of how much time the entire critical section took. It
+ // includes the time the recipient took to fetch the latest modifications from the donor and
+ // persist them plus the critical section commit time.
+ //
+ // The value of totalCriticalSectionTimeMillis - totalCriticalSectionCommitTimeMillis gives the
+ // duration of the catch-up phase of the critical section (where the last mods are transferred
+ // from the donor to the recipient).
+ AtomicInt64 totalCriticalSectionTimeMillis{0};
+
+ /**
+ * Obtains the per-process instance of the sharding statistics object.
+ */
+ static ShardingStatistics& get(ServiceContext* serviceContext);
+ static ShardingStatistics& get(OperationContext* opCtx);
+
+ /**
+ * Reports the accumulated statistics for serverStatus.
+ */
+ void report(BSONObjBuilder* builder) const;
+};
+
+} // namespace mongo