summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBlake Oler <blake.oler@mongodb.com>2020-04-08 16:26:45 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-04-16 14:33:32 +0000
commit7545d9ffa5f7c980a1619d419e8fc03647e86cc5 (patch)
tree051fd3d0658cd86f6955efd2f969613eaa5561c8
parent20364840b8f1af16917e4c23c1b5f5efd8b352f8 (diff)
downloadmongo-7545d9ffa5f7c980a1619d419e8fc03647e86cc5.tar.gz
SERVER-46396 Add metrics to track number of operations blocked behind a catalog cache refresh
(cherry picked from commit 8d136e00f5a204609403075ccd96fceb8d6e8ea7) (chunk_bounds_util.js cherry picked from commit aefce507386045fe6a77eebcbec567000a6a98e5 authored by Cheahuychou Mao (cheahuychou on Github))
-rw-r--r--buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml1
-rw-r--r--jstests/sharding/catalog_cache_refresh_counters.js111
-rw-r--r--jstests/sharding/libs/chunk_bounds_util.js74
-rw-r--r--src/mongo/s/catalog_cache.cpp55
-rw-r--r--src/mongo/s/catalog_cache.h18
-rw-r--r--src/mongo/s/commands/cluster_find_cmd.cpp5
-rw-r--r--src/mongo/s/commands/cluster_write_cmd.cpp7
-rw-r--r--src/mongo/s/commands/strategy.cpp16
8 files changed, 286 insertions, 1 deletions
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
index 266f193a29c..b04b769e8ac 100644
--- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
+++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
@@ -62,6 +62,7 @@ selector:
- jstests/sharding/agg_write_stages_cannot_run_on_mongos.js
# Enable when SERVER-40435 is backported to last stable 4.0
- jstests/sharding/clear_jumbo.js
+ - jstests/sharding/catalog_cache_refresh_counters.js
- jstests/sharding/explain_agg_read_pref.js
- jstests/sharding/explain_cmd.js
- jstests/sharding/failcommand_failpoint_not_parallel.js
diff --git a/jstests/sharding/catalog_cache_refresh_counters.js b/jstests/sharding/catalog_cache_refresh_counters.js
new file mode 100644
index 00000000000..67a237d881e
--- /dev/null
+++ b/jstests/sharding/catalog_cache_refresh_counters.js
@@ -0,0 +1,111 @@
+/**
+ * Test that operations that get blocked by catalog cache refreshes get logged to
+ * shardingStatistics.
+ */
+
+(function() {
+'use strict';
+
+load('jstests/sharding/libs/sharded_transactions_helpers.js');
+load("jstests/sharding/libs/chunk_bounds_util.js");
+
+let st = new ShardingTest({mongos: 2, shards: 2});
+const configDB = st.s.getDB('config');
+const dbName = "test";
+const collName = "foo";
+const ns = dbName + "." + collName;
+const mongos0DB = st.s0.getDB(dbName);
+const mongos1DB = st.s1.getDB(dbName);
+const mongos0Coll = mongos0DB[collName];
+const mongos1Coll = mongos1DB[collName];
+
+let setUp = () => {
+ /**
+     * Sets up a test by moving chunks such that one chunk is on each
+ * shard, with the following distribution:
+ * shard0: [-inf, 0)
+ * shard1: [0, inf)
+ */
+ assert.commandWorked(st.s0.adminCommand({enableSharding: dbName}));
+ assert.commandWorked(st.s0.adminCommand({movePrimary: dbName, to: st.shard0.shardName}));
+ assert.commandWorked(st.s0.adminCommand({shardCollection: ns, key: {x: 1}}));
+ assert.commandWorked(st.s0.adminCommand({split: ns, middle: {x: 0}}));
+
+ flushRoutersAndRefreshShardMetadata(st, {ns});
+};
+
+let getOpsBlockedByRefresh = () => {
+ return assert.commandWorked(st.s1.adminCommand({serverStatus: 1}))
+ .shardingStatistics.catalogCache.operationsBlockedByRefresh;
+};
+
+let verifyBlockedOperationsChange = (oldOperationsCount, increasedOps) => {
+ let newOperationsCount = getOpsBlockedByRefresh();
+ increasedOps.forEach(op => {
+ assert.eq(newOperationsCount[op.opType], oldOperationsCount[op.opType] + op.increase);
+ });
+};
+
+let getShardToTargetForMoveChunk = () => {
+ const chunkDocs = configDB.chunks.find({ns: ns}).toArray();
+ const shardChunkBounds = chunkBoundsUtil.findShardChunkBounds(chunkDocs);
+ const shardThatOwnsChunk = chunkBoundsUtil.findShardForShardKey(st, shardChunkBounds, {x: 100});
+ return st.getOther(shardThatOwnsChunk).shardName;
+};
+
+let runTest = (operationToRunFn, expectedOpIncreases) => {
+ let opsBlockedByRefresh = getOpsBlockedByRefresh();
+
+ // Move chunk to ensure stale shard version for the next operation.
+ assert.commandWorked(
+ st.s0.adminCommand({moveChunk: ns, find: {x: 100}, to: getShardToTargetForMoveChunk()}));
+
+ operationToRunFn();
+ verifyBlockedOperationsChange(opsBlockedByRefresh, expectedOpIncreases);
+};
+
+setUp();
+
+/**
+ * Verify that insert operations get logged when blocked by a refresh.
+ */
+runTest(() => assert.commandWorked(mongos1Coll.insert({x: 250})), [
+ {opType: 'countAllOperations', increase: 2},
+ {opType: 'countInserts', increase: 1},
+ {opType: 'countCommands', increase: 1}
+]);
+
+/**
+ * Verify that queries get logged when blocked by a refresh.
+ */
+runTest(() => mongos1Coll.findOne({x: 250}),
+ [{opType: 'countAllOperations', increase: 1}, {opType: 'countQueries', increase: 1}]);
+
+/**
+ * Verify that updates get logged when blocked by a refresh.
+ */
+runTest(() => assert.commandWorked(mongos1Coll.update({x: 250}, {$set: {a: 1}})), [
+ {opType: 'countAllOperations', increase: 2},
+ {opType: 'countUpdates', increase: 1},
+ {opType: 'countCommands', increase: 1}
+]);
+
+/**
+ * Verify that deletes get logged when blocked by a refresh.
+ */
+runTest(() => assert.commandWorked(mongos1Coll.remove({x: 250})), [
+ {opType: 'countAllOperations', increase: 2},
+ {opType: 'countDeletes', increase: 1},
+ {opType: 'countCommands', increase: 1}
+]);
+
+/**
+ * Verify that non-CRUD commands get logged when blocked by a refresh.
+ */
+runTest(() => assert.commandWorked(mongos1DB.runCommand({collStats: collName})), [
+ {opType: 'countAllOperations', increase: 1},
+ {opType: 'countCommands', increase: 1},
+]);
+
+st.stop();
+})();
diff --git a/jstests/sharding/libs/chunk_bounds_util.js b/jstests/sharding/libs/chunk_bounds_util.js
new file mode 100644
index 00000000000..c6d91725558
--- /dev/null
+++ b/jstests/sharding/libs/chunk_bounds_util.js
@@ -0,0 +1,74 @@
+/*
+ * Utilities for dealing with chunk bounds.
+ */
+var chunkBoundsUtil = (function() {
+ let _gte = function(shardKeyA, shardKeyB) {
+ return bsonWoCompare(shardKeyA, shardKeyB) >= 0;
+ };
+
+ let _lt = function(shardKeyA, shardKeyB) {
+ return bsonWoCompare(shardKeyA, shardKeyB) < 0;
+ };
+
+ let containsKey = function(shardKey, minKey, maxKey) {
+ return _gte(shardKey, minKey) && _lt(shardKey, maxKey);
+ };
+
+ /*
+     * Returns an object mapping each shard name to an array of chunk bounds
+ * that it owns.
+ */
+ let findShardChunkBounds = function(chunkDocs) {
+ let allBounds = {};
+ for (let chunkDoc of chunkDocs) {
+ let bounds = [chunkDoc.min, chunkDoc.max];
+
+ if (!(chunkDoc.shard in allBounds)) {
+ allBounds[chunkDoc.shard] = [bounds];
+ } else {
+ allBounds[chunkDoc.shard].push(bounds);
+ }
+ }
+ return allBounds;
+ };
+
+ /*
+ * Returns the corresponding shard object for the given shard name.
+ */
+ let _getShard = function(st, shardName) {
+ for (let i = 0; i < st._connections.length; i++) {
+ if (st._connections[i].shardName == shardName) {
+ return st._connections[i];
+ }
+ }
+ };
+
+ /*
+ * Returns the shard object for the shard that owns the chunk that contains
+ * the given shard key value and the bounds of the chunk.
+ */
+ let findShardAndChunkBoundsForShardKey = function(st, shardChunkBounds, shardKey) {
+ for (const [shardName, chunkBounds] of Object.entries(shardChunkBounds)) {
+ for (let bounds of chunkBounds) {
+ if (containsKey(shardKey, bounds[0], bounds[1])) {
+ return {shard: _getShard(st, shardName), bounds: bounds};
+ }
+ }
+ }
+ };
+
+ /*
+ * Returns the shard object for the shard that owns the chunk that contains
+ * the given shard key value.
+ */
+ let findShardForShardKey = function(st, shardChunkBounds, shardKey) {
+ return findShardAndChunkBoundsForShardKey(st, shardChunkBounds, shardKey).shard;
+ };
+
+ return {
+ containsKey,
+ findShardChunkBounds,
+ findShardAndChunkBoundsForShardKey,
+ findShardForShardKey
+ };
+})(); \ No newline at end of file
diff --git a/src/mongo/s/catalog_cache.cpp b/src/mongo/s/catalog_cache.cpp
index 918e2cb1f2e..e7f8b5d9603 100644
--- a/src/mongo/s/catalog_cache.cpp
+++ b/src/mongo/s/catalog_cache.cpp
@@ -49,6 +49,9 @@
#include "mongo/util/timer.h"
namespace mongo {
+const OperationContext::Decoration<bool> operationBlockedBehindCatalogCacheRefresh =
+ OperationContext::declareDecoration<bool>();
+
namespace {
// How many times to try refreshing the routing info if the set of chunks loaded from the config
@@ -232,6 +235,9 @@ CatalogCache::RefreshResult CatalogCache::_getCollectionRoutingInfoAt(
auto& collEntry = itColl->second;
if (collEntry->needsRefresh) {
+
+ operationBlockedBehindCatalogCacheRefresh(opCtx) = true;
+
auto refreshNotification = collEntry->refreshCompletionNotification;
if (!refreshNotification) {
refreshNotification = (collEntry->refreshCompletionNotification =
@@ -502,6 +508,37 @@ void CatalogCache::report(BSONObjBuilder* builder) const {
_stats.report(&cacheStatsBuilder);
}
+void CatalogCache::checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx,
+ mongo::LogicalOp opType) {
+ if (!operationBlockedBehindCatalogCacheRefresh(opCtx)) {
+ return;
+ }
+
+ auto& opsBlockedByRefresh = _stats.operationsBlockedByRefresh;
+
+ opsBlockedByRefresh.countAllOperations.fetchAndAddRelaxed(1);
+
+ switch (opType) {
+ case LogicalOp::opInsert:
+ opsBlockedByRefresh.countInserts.fetchAndAddRelaxed(1);
+ break;
+ case LogicalOp::opQuery:
+ opsBlockedByRefresh.countQueries.fetchAndAddRelaxed(1);
+ break;
+ case LogicalOp::opUpdate:
+ opsBlockedByRefresh.countUpdates.fetchAndAddRelaxed(1);
+ break;
+ case LogicalOp::opDelete:
+ opsBlockedByRefresh.countDeletes.fetchAndAddRelaxed(1);
+ break;
+ case LogicalOp::opCommand:
+ opsBlockedByRefresh.countCommands.fetchAndAddRelaxed(1);
+ break;
+ default:
+ MONGO_UNREACHABLE;
+ }
+}
+
void CatalogCache::_scheduleDatabaseRefresh(WithLock lk,
const std::string& dbName,
std::shared_ptr<DatabaseInfoEntry> dbEntry) {
@@ -717,6 +754,24 @@ void CatalogCache::Stats::report(BSONObjBuilder* builder) const {
builder->append("countFullRefreshesStarted", countFullRefreshesStarted.load());
builder->append("countFailedRefreshes", countFailedRefreshes.load());
+
+ BSONObjBuilder operationsBlockedByRefreshBuilder(
+ builder->subobjStart("operationsBlockedByRefresh"));
+
+ operationsBlockedByRefreshBuilder.append("countAllOperations",
+ operationsBlockedByRefresh.countAllOperations.load());
+ operationsBlockedByRefreshBuilder.append("countInserts",
+ operationsBlockedByRefresh.countInserts.load());
+ operationsBlockedByRefreshBuilder.append("countQueries",
+ operationsBlockedByRefresh.countQueries.load());
+ operationsBlockedByRefreshBuilder.append("countUpdates",
+ operationsBlockedByRefresh.countUpdates.load());
+ operationsBlockedByRefreshBuilder.append("countDeletes",
+ operationsBlockedByRefresh.countDeletes.load());
+ operationsBlockedByRefreshBuilder.append("countCommands",
+ operationsBlockedByRefresh.countCommands.load());
+
+ operationsBlockedByRefreshBuilder.done();
}
CachedDatabaseInfo::CachedDatabaseInfo(DatabaseType dbt, std::shared_ptr<Shard> primaryShard)
diff --git a/src/mongo/s/catalog_cache.h b/src/mongo/s/catalog_cache.h
index 310c90e66bd..582554514c0 100644
--- a/src/mongo/s/catalog_cache.h
+++ b/src/mongo/s/catalog_cache.h
@@ -255,6 +255,12 @@ public:
*/
void report(BSONObjBuilder* builder) const;
+ /**
+     * Checks if the current operation was ever marked as needing refresh. If the current operation
+ * was marked as needing refresh, updates the relevant counters inside the Stats struct.
+ */
+ void checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx, mongo::LogicalOp opType);
+
private:
// Make the cache entries friends so they can access the private classes below
friend class CachedDatabaseInfo;
@@ -381,6 +387,18 @@ private:
// for whatever reason
AtomicWord<long long> countFailedRefreshes{0};
+ // Cumulative, always-increasing counter of how many operations have been blocked by a
+ // catalog cache refresh. Broken down by operation type to match the operations tracked
+ // by the OpCounters class.
+ struct OperationsBlockedByRefresh {
+ AtomicWord<long long> countAllOperations{0};
+ AtomicWord<long long> countInserts{0};
+ AtomicWord<long long> countQueries{0};
+ AtomicWord<long long> countUpdates{0};
+ AtomicWord<long long> countDeletes{0};
+ AtomicWord<long long> countCommands{0};
+ } operationsBlockedByRefresh;
+
/**
* Reports the accumulated statistics for serverStatus.
*/
diff --git a/src/mongo/s/commands/cluster_find_cmd.cpp b/src/mongo/s/commands/cluster_find_cmd.cpp
index e133875439b..8fdc3d5af07 100644
--- a/src/mongo/s/commands/cluster_find_cmd.cpp
+++ b/src/mongo/s/commands/cluster_find_cmd.cpp
@@ -198,6 +198,11 @@ public:
// We count find command as a query op.
globalOpCounters.gotQuery();
+ ON_BLOCK_EXIT([opCtx] {
+ Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh(
+ opCtx, mongo::LogicalOp::opQuery);
+ });
+
const bool isExplain = false;
auto qr = parseCmdObjectToQueryRequest(opCtx, ns(), _request.body, isExplain);
diff --git a/src/mongo/s/commands/cluster_write_cmd.cpp b/src/mongo/s/commands/cluster_write_cmd.cpp
index e3dee43a5b6..75649cb1775 100644
--- a/src/mongo/s/commands/cluster_write_cmd.cpp
+++ b/src/mongo/s/commands/cluster_write_cmd.cpp
@@ -474,17 +474,22 @@ private:
// TODO: increase opcounters by more than one
auto& debug = CurOp::get(opCtx)->debug();
+ auto catalogCache = Grid::get(opCtx)->catalogCache();
switch (_batchedRequest.getBatchType()) {
case BatchedCommandRequest::BatchType_Insert:
for (size_t i = 0; i < numAttempts; ++i) {
globalOpCounters.gotInsert();
}
+ catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx,
+ mongo::LogicalOp::opInsert);
debug.additiveMetrics.ninserted = response.getN();
break;
case BatchedCommandRequest::BatchType_Update:
for (size_t i = 0; i < numAttempts; ++i) {
globalOpCounters.gotUpdate();
}
+ catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx,
+ mongo::LogicalOp::opUpdate);
debug.upsert = response.isUpsertDetailsSet();
debug.additiveMetrics.nMatched =
response.getN() - (debug.upsert ? response.sizeUpsertDetails() : 0);
@@ -504,6 +509,8 @@ private:
for (size_t i = 0; i < numAttempts; ++i) {
globalOpCounters.gotDelete();
}
+ catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx,
+ mongo::LogicalOp::opDelete);
debug.additiveMetrics.ndeleted = response.getN();
break;
}
diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp
index 76e2d0d2fc9..def36c7a210 100644
--- a/src/mongo/s/commands/strategy.cpp
+++ b/src/mongo/s/commands/strategy.cpp
@@ -237,10 +237,19 @@ void execCommandClient(OperationContext* opCtx,
c->incrementCommandsExecuted();
- if (c->shouldAffectCommandCounter()) {
+ auto shouldAffectCommandCounter = c->shouldAffectCommandCounter();
+
+ if (shouldAffectCommandCounter) {
globalOpCounters.gotCommand();
}
+ ON_BLOCK_EXIT([opCtx, shouldAffectCommandCounter] {
+ if (shouldAffectCommandCounter) {
+ Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh(
+ opCtx, mongo::LogicalOp::opCommand);
+ }
+ });
+
auto wcResult = uassertStatusOK(WriteConcernOptions::extractWCFromCommand(request.body));
bool supportsWriteConcern = invocation->supportsWriteConcern();
@@ -621,6 +630,11 @@ void runCommand(OperationContext* opCtx,
DbResponse Strategy::queryOp(OperationContext* opCtx, const NamespaceString& nss, DbMessage* dbm) {
globalOpCounters.gotQuery();
+ ON_BLOCK_EXIT([opCtx] {
+ Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh(
+ opCtx, mongo::LogicalOp::opQuery);
+ });
+
const QueryMessage q(*dbm);
const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip);