diff options
author | Blake Oler <blake.oler@mongodb.com> | 2020-04-08 16:26:45 -0400 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-16 14:33:32 +0000 |
commit | 7545d9ffa5f7c980a1619d419e8fc03647e86cc5 (patch) | |
tree | 051fd3d0658cd86f6955efd2f969613eaa5561c8 | |
parent | 20364840b8f1af16917e4c23c1b5f5efd8b352f8 (diff) | |
download | mongo-7545d9ffa5f7c980a1619d419e8fc03647e86cc5.tar.gz |
SERVER-46396 Add metrics to track number of operations blocked behind a catalog cache refresh
(cherry picked from commit 8d136e00f5a204609403075ccd96fceb8d6e8ea7)
(chunk_bounds_util.js cherry picked from commit aefce507386045fe6a77eebcbec567000a6a98e5
authored by Cheahuychou Mao (cheahuychou on Github))
-rw-r--r-- | buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml | 1 | ||||
-rw-r--r-- | jstests/sharding/catalog_cache_refresh_counters.js | 111 | ||||
-rw-r--r-- | jstests/sharding/libs/chunk_bounds_util.js | 74 | ||||
-rw-r--r-- | src/mongo/s/catalog_cache.cpp | 55 | ||||
-rw-r--r-- | src/mongo/s/catalog_cache.h | 18 | ||||
-rw-r--r-- | src/mongo/s/commands/cluster_find_cmd.cpp | 5 | ||||
-rw-r--r-- | src/mongo/s/commands/cluster_write_cmd.cpp | 7 | ||||
-rw-r--r-- | src/mongo/s/commands/strategy.cpp | 16 |
8 files changed, 286 insertions, 1 deletion
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml index 266f193a29c..b04b769e8ac 100644 --- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml +++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml @@ -62,6 +62,7 @@ selector: - jstests/sharding/agg_write_stages_cannot_run_on_mongos.js # Enable when SERVER-40435 is backported to last stable 4.0 - jstests/sharding/clear_jumbo.js + - jstests/sharding/catalog_cache_refresh_counters.js - jstests/sharding/explain_agg_read_pref.js - jstests/sharding/explain_cmd.js - jstests/sharding/failcommand_failpoint_not_parallel.js diff --git a/jstests/sharding/catalog_cache_refresh_counters.js b/jstests/sharding/catalog_cache_refresh_counters.js new file mode 100644 index 00000000000..67a237d881e --- /dev/null +++ b/jstests/sharding/catalog_cache_refresh_counters.js @@ -0,0 +1,111 @@ +/** + * Test that operations that get blocked by catalog cache refreshes get logged to + * shardingStatistics. + */ + +(function() { +'use strict'; + +load('jstests/sharding/libs/sharded_transactions_helpers.js'); +load("jstests/sharding/libs/chunk_bounds_util.js"); + +let st = new ShardingTest({mongos: 2, shards: 2}); +const configDB = st.s.getDB('config'); +const dbName = "test"; +const collName = "foo"; +const ns = dbName + "." 
+ collName; +const mongos0DB = st.s0.getDB(dbName); +const mongos1DB = st.s1.getDB(dbName); +const mongos0Coll = mongos0DB[collName]; +const mongos1Coll = mongos1DB[collName]; + +let setUp = () => { + /** + * Sets up a test by moving chunks to such that one chunk is on each + * shard, with the following distribution: + * shard0: [-inf, 0) + * shard1: [0, inf) + */ + assert.commandWorked(st.s0.adminCommand({enableSharding: dbName})); + assert.commandWorked(st.s0.adminCommand({movePrimary: dbName, to: st.shard0.shardName})); + assert.commandWorked(st.s0.adminCommand({shardCollection: ns, key: {x: 1}})); + assert.commandWorked(st.s0.adminCommand({split: ns, middle: {x: 0}})); + + flushRoutersAndRefreshShardMetadata(st, {ns}); +}; + +let getOpsBlockedByRefresh = () => { + return assert.commandWorked(st.s1.adminCommand({serverStatus: 1})) + .shardingStatistics.catalogCache.operationsBlockedByRefresh; +}; + +let verifyBlockedOperationsChange = (oldOperationsCount, increasedOps) => { + let newOperationsCount = getOpsBlockedByRefresh(); + increasedOps.forEach(op => { + assert.eq(newOperationsCount[op.opType], oldOperationsCount[op.opType] + op.increase); + }); +}; + +let getShardToTargetForMoveChunk = () => { + const chunkDocs = configDB.chunks.find({ns: ns}).toArray(); + const shardChunkBounds = chunkBoundsUtil.findShardChunkBounds(chunkDocs); + const shardThatOwnsChunk = chunkBoundsUtil.findShardForShardKey(st, shardChunkBounds, {x: 100}); + return st.getOther(shardThatOwnsChunk).shardName; +}; + +let runTest = (operationToRunFn, expectedOpIncreases) => { + let opsBlockedByRefresh = getOpsBlockedByRefresh(); + + // Move chunk to ensure stale shard version for the next operation. 
+ assert.commandWorked( + st.s0.adminCommand({moveChunk: ns, find: {x: 100}, to: getShardToTargetForMoveChunk()})); + + operationToRunFn(); + verifyBlockedOperationsChange(opsBlockedByRefresh, expectedOpIncreases); +}; + +setUp(); + +/** + * Verify that insert operations get logged when blocked by a refresh. + */ +runTest(() => assert.commandWorked(mongos1Coll.insert({x: 250})), [ + {opType: 'countAllOperations', increase: 2}, + {opType: 'countInserts', increase: 1}, + {opType: 'countCommands', increase: 1} +]); + +/** + * Verify that queries get logged when blocked by a refresh. + */ +runTest(() => mongos1Coll.findOne({x: 250}), + [{opType: 'countAllOperations', increase: 1}, {opType: 'countQueries', increase: 1}]); + +/** + * Verify that updates get logged when blocked by a refresh. + */ +runTest(() => assert.commandWorked(mongos1Coll.update({x: 250}, {$set: {a: 1}})), [ + {opType: 'countAllOperations', increase: 2}, + {opType: 'countUpdates', increase: 1}, + {opType: 'countCommands', increase: 1} +]); + +/** + * Verify that deletes get logged when blocked by a refresh. + */ +runTest(() => assert.commandWorked(mongos1Coll.remove({x: 250})), [ + {opType: 'countAllOperations', increase: 2}, + {opType: 'countDeletes', increase: 1}, + {opType: 'countCommands', increase: 1} +]); + +/** + * Verify that non-CRUD commands get logged when blocked by a refresh. + */ +runTest(() => assert.commandWorked(mongos1DB.runCommand({collStats: collName})), [ + {opType: 'countAllOperations', increase: 1}, + {opType: 'countCommands', increase: 1}, +]); + +st.stop(); +})(); diff --git a/jstests/sharding/libs/chunk_bounds_util.js b/jstests/sharding/libs/chunk_bounds_util.js new file mode 100644 index 00000000000..c6d91725558 --- /dev/null +++ b/jstests/sharding/libs/chunk_bounds_util.js @@ -0,0 +1,74 @@ +/* + * Utilities for dealing with chunk bounds. 
+ */ +var chunkBoundsUtil = (function() { + let _gte = function(shardKeyA, shardKeyB) { + return bsonWoCompare(shardKeyA, shardKeyB) >= 0; + }; + + let _lt = function(shardKeyA, shardKeyB) { + return bsonWoCompare(shardKeyA, shardKeyB) < 0; + }; + + let containsKey = function(shardKey, minKey, maxKey) { + return _gte(shardKey, minKey) && _lt(shardKey, maxKey); + }; + + /* + * Returns a object mapping each shard name to an array of chunk bounds + * that it owns. + */ + let findShardChunkBounds = function(chunkDocs) { + let allBounds = {}; + for (let chunkDoc of chunkDocs) { + let bounds = [chunkDoc.min, chunkDoc.max]; + + if (!(chunkDoc.shard in allBounds)) { + allBounds[chunkDoc.shard] = [bounds]; + } else { + allBounds[chunkDoc.shard].push(bounds); + } + } + return allBounds; + }; + + /* + * Returns the corresponding shard object for the given shard name. + */ + let _getShard = function(st, shardName) { + for (let i = 0; i < st._connections.length; i++) { + if (st._connections[i].shardName == shardName) { + return st._connections[i]; + } + } + }; + + /* + * Returns the shard object for the shard that owns the chunk that contains + * the given shard key value and the bounds of the chunk. + */ + let findShardAndChunkBoundsForShardKey = function(st, shardChunkBounds, shardKey) { + for (const [shardName, chunkBounds] of Object.entries(shardChunkBounds)) { + for (let bounds of chunkBounds) { + if (containsKey(shardKey, bounds[0], bounds[1])) { + return {shard: _getShard(st, shardName), bounds: bounds}; + } + } + } + }; + + /* + * Returns the shard object for the shard that owns the chunk that contains + * the given shard key value. + */ + let findShardForShardKey = function(st, shardChunkBounds, shardKey) { + return findShardAndChunkBoundsForShardKey(st, shardChunkBounds, shardKey).shard; + }; + + return { + containsKey, + findShardChunkBounds, + findShardAndChunkBoundsForShardKey, + findShardForShardKey + }; +})();
\ No newline at end of file diff --git a/src/mongo/s/catalog_cache.cpp b/src/mongo/s/catalog_cache.cpp index 918e2cb1f2e..e7f8b5d9603 100644 --- a/src/mongo/s/catalog_cache.cpp +++ b/src/mongo/s/catalog_cache.cpp @@ -49,6 +49,9 @@ #include "mongo/util/timer.h" namespace mongo { +const OperationContext::Decoration<bool> operationBlockedBehindCatalogCacheRefresh = + OperationContext::declareDecoration<bool>(); + namespace { // How many times to try refreshing the routing info if the set of chunks loaded from the config @@ -232,6 +235,9 @@ CatalogCache::RefreshResult CatalogCache::_getCollectionRoutingInfoAt( auto& collEntry = itColl->second; if (collEntry->needsRefresh) { + + operationBlockedBehindCatalogCacheRefresh(opCtx) = true; + auto refreshNotification = collEntry->refreshCompletionNotification; if (!refreshNotification) { refreshNotification = (collEntry->refreshCompletionNotification = @@ -502,6 +508,37 @@ void CatalogCache::report(BSONObjBuilder* builder) const { _stats.report(&cacheStatsBuilder); } +void CatalogCache::checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx, + mongo::LogicalOp opType) { + if (!operationBlockedBehindCatalogCacheRefresh(opCtx)) { + return; + } + + auto& opsBlockedByRefresh = _stats.operationsBlockedByRefresh; + + opsBlockedByRefresh.countAllOperations.fetchAndAddRelaxed(1); + + switch (opType) { + case LogicalOp::opInsert: + opsBlockedByRefresh.countInserts.fetchAndAddRelaxed(1); + break; + case LogicalOp::opQuery: + opsBlockedByRefresh.countQueries.fetchAndAddRelaxed(1); + break; + case LogicalOp::opUpdate: + opsBlockedByRefresh.countUpdates.fetchAndAddRelaxed(1); + break; + case LogicalOp::opDelete: + opsBlockedByRefresh.countDeletes.fetchAndAddRelaxed(1); + break; + case LogicalOp::opCommand: + opsBlockedByRefresh.countCommands.fetchAndAddRelaxed(1); + break; + default: + MONGO_UNREACHABLE; + } +} + void CatalogCache::_scheduleDatabaseRefresh(WithLock lk, const std::string& dbName, std::shared_ptr<DatabaseInfoEntry> 
dbEntry) { @@ -717,6 +754,24 @@ void CatalogCache::Stats::report(BSONObjBuilder* builder) const { builder->append("countFullRefreshesStarted", countFullRefreshesStarted.load()); builder->append("countFailedRefreshes", countFailedRefreshes.load()); + + BSONObjBuilder operationsBlockedByRefreshBuilder( + builder->subobjStart("operationsBlockedByRefresh")); + + operationsBlockedByRefreshBuilder.append("countAllOperations", + operationsBlockedByRefresh.countAllOperations.load()); + operationsBlockedByRefreshBuilder.append("countInserts", + operationsBlockedByRefresh.countInserts.load()); + operationsBlockedByRefreshBuilder.append("countQueries", + operationsBlockedByRefresh.countQueries.load()); + operationsBlockedByRefreshBuilder.append("countUpdates", + operationsBlockedByRefresh.countUpdates.load()); + operationsBlockedByRefreshBuilder.append("countDeletes", + operationsBlockedByRefresh.countDeletes.load()); + operationsBlockedByRefreshBuilder.append("countCommands", + operationsBlockedByRefresh.countCommands.load()); + + operationsBlockedByRefreshBuilder.done(); } CachedDatabaseInfo::CachedDatabaseInfo(DatabaseType dbt, std::shared_ptr<Shard> primaryShard) diff --git a/src/mongo/s/catalog_cache.h b/src/mongo/s/catalog_cache.h index 310c90e66bd..582554514c0 100644 --- a/src/mongo/s/catalog_cache.h +++ b/src/mongo/s/catalog_cache.h @@ -255,6 +255,12 @@ public: */ void report(BSONObjBuilder* builder) const; + /** + * Checks if the current operation was ever marked as needing refresh. If the curent operation + * was marked as needing refresh, updates the relevant counters inside the Stats struct. 
+ */ + void checkAndRecordOperationBlockedByRefresh(OperationContext* opCtx, mongo::LogicalOp opType); + private: // Make the cache entries friends so they can access the private classes below friend class CachedDatabaseInfo; @@ -381,6 +387,18 @@ private: // for whatever reason AtomicWord<long long> countFailedRefreshes{0}; + // Cumulative, always-increasing counter of how many operations have been blocked by a + // catalog cache refresh. Broken down by operation type to match the operations tracked + // by the OpCounters class. + struct OperationsBlockedByRefresh { + AtomicWord<long long> countAllOperations{0}; + AtomicWord<long long> countInserts{0}; + AtomicWord<long long> countQueries{0}; + AtomicWord<long long> countUpdates{0}; + AtomicWord<long long> countDeletes{0}; + AtomicWord<long long> countCommands{0}; + } operationsBlockedByRefresh; + /** * Reports the accumulated statistics for serverStatus. */ diff --git a/src/mongo/s/commands/cluster_find_cmd.cpp b/src/mongo/s/commands/cluster_find_cmd.cpp index e133875439b..8fdc3d5af07 100644 --- a/src/mongo/s/commands/cluster_find_cmd.cpp +++ b/src/mongo/s/commands/cluster_find_cmd.cpp @@ -198,6 +198,11 @@ public: // We count find command as a query op. 
globalOpCounters.gotQuery(); + ON_BLOCK_EXIT([opCtx] { + Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh( + opCtx, mongo::LogicalOp::opQuery); + }); + const bool isExplain = false; auto qr = parseCmdObjectToQueryRequest(opCtx, ns(), _request.body, isExplain); diff --git a/src/mongo/s/commands/cluster_write_cmd.cpp b/src/mongo/s/commands/cluster_write_cmd.cpp index e3dee43a5b6..75649cb1775 100644 --- a/src/mongo/s/commands/cluster_write_cmd.cpp +++ b/src/mongo/s/commands/cluster_write_cmd.cpp @@ -474,17 +474,22 @@ private: // TODO: increase opcounters by more than one auto& debug = CurOp::get(opCtx)->debug(); + auto catalogCache = Grid::get(opCtx)->catalogCache(); switch (_batchedRequest.getBatchType()) { case BatchedCommandRequest::BatchType_Insert: for (size_t i = 0; i < numAttempts; ++i) { globalOpCounters.gotInsert(); } + catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx, + mongo::LogicalOp::opInsert); debug.additiveMetrics.ninserted = response.getN(); break; case BatchedCommandRequest::BatchType_Update: for (size_t i = 0; i < numAttempts; ++i) { globalOpCounters.gotUpdate(); } + catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx, + mongo::LogicalOp::opUpdate); debug.upsert = response.isUpsertDetailsSet(); debug.additiveMetrics.nMatched = response.getN() - (debug.upsert ? 
response.sizeUpsertDetails() : 0); @@ -504,6 +509,8 @@ private: for (size_t i = 0; i < numAttempts; ++i) { globalOpCounters.gotDelete(); } + catalogCache->checkAndRecordOperationBlockedByRefresh(opCtx, + mongo::LogicalOp::opDelete); debug.additiveMetrics.ndeleted = response.getN(); break; } diff --git a/src/mongo/s/commands/strategy.cpp b/src/mongo/s/commands/strategy.cpp index 76e2d0d2fc9..def36c7a210 100644 --- a/src/mongo/s/commands/strategy.cpp +++ b/src/mongo/s/commands/strategy.cpp @@ -237,10 +237,19 @@ void execCommandClient(OperationContext* opCtx, c->incrementCommandsExecuted(); - if (c->shouldAffectCommandCounter()) { + auto shouldAffectCommandCounter = c->shouldAffectCommandCounter(); + + if (shouldAffectCommandCounter) { globalOpCounters.gotCommand(); } + ON_BLOCK_EXIT([opCtx, shouldAffectCommandCounter] { + if (shouldAffectCommandCounter) { + Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh( + opCtx, mongo::LogicalOp::opCommand); + } + }); + auto wcResult = uassertStatusOK(WriteConcernOptions::extractWCFromCommand(request.body)); bool supportsWriteConcern = invocation->supportsWriteConcern(); @@ -621,6 +630,11 @@ void runCommand(OperationContext* opCtx, DbResponse Strategy::queryOp(OperationContext* opCtx, const NamespaceString& nss, DbMessage* dbm) { globalOpCounters.gotQuery(); + ON_BLOCK_EXIT([opCtx] { + Grid::get(opCtx)->catalogCache()->checkAndRecordOperationBlockedByRefresh( + opCtx, mongo::LogicalOp::opQuery); + }); + const QueryMessage q(*dbm); const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip); |