From ff5e2fc2fde5edf9ccd7c9814734651996f45851 Mon Sep 17 00:00:00 2001 From: Louis Williams Date: Tue, 28 Mar 2023 07:29:27 +0000 Subject: SERVER-75277 Clear resource consumption metrics before each TTL index pass (cherry picked from commit 8e744478f8569c1ead0ffe7b725c89983c886ac6) --- etc/backports_required_for_multiversion_tests.yml | 4 + .../ttl_operation_metrics_multi_dbs.js | 148 +++++++++++++++++++++ .../db/stats/resource_consumption_metrics.cpp | 17 +++ src/mongo/db/stats/resource_consumption_metrics.h | 4 +- .../db/stats/resource_consumption_metrics_test.cpp | 61 +++++++++ 5 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js diff --git a/etc/backports_required_for_multiversion_tests.yml b/etc/backports_required_for_multiversion_tests.yml index f8f1b94de15..bb2261f573c 100644 --- a/etc/backports_required_for_multiversion_tests.yml +++ b/etc/backports_required_for_multiversion_tests.yml @@ -324,6 +324,8 @@ last-continuous: ticket: SERVER-73482 - test_file: jstests/replsets/session_cache_refresh_write_error_fail.js ticket: SERVER-73229 + - test_file: jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js + ticket: SERVER-75277 suites: null last-lts: all: @@ -719,4 +721,6 @@ last-lts: ticket: SERVER-73482 - test_file: jstests/replsets/session_cache_refresh_write_error_fail.js ticket: SERVER-73229 + - test_file: jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js + ticket: SERVER-75277 suites: null diff --git a/jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js b/jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js new file mode 100644 index 00000000000..e6e1bfefdf3 --- /dev/null +++ b/jstests/noPassthrough/ttl_operation_metrics_multi_dbs.js @@ -0,0 +1,148 @@ +/** + * Tests resource consumption metrics for TTL indexes on multiple databases. + * + * @tags: [ + * requires_replication, + * ] + */ +(function() { +'use strict'; + +load('jstests/noPassthrough/libs/index_build.js'); // For IndexBuildTest +load("jstests/libs/fail_point_util.js"); +load("jstests/libs/ttl_util.js"); + +var rst = new ReplSetTest({ + nodes: 2, + nodeOptions: { + setParameter: { + "aggregateOperationResourceConsumptionMetrics": true, + "ttlMonitorSleepSecs": 1, + } + } +}); +rst.startSet(); +rst.initiate(); + +const dbName1 = 'db1'; +const dbName2 = 'db2'; +const collName = 'coll'; +const primary = rst.getPrimary(); +const secondary = rst.getSecondary(); +const primaryDB1 = primary.getDB(dbName1); +const primaryDB2 = primary.getDB(dbName2); + +const clearMetrics = (conn) => { + conn.getDB('admin').aggregate([{$operationMetrics: {clearMetrics: true}}]); +}; + +// Get aggregated metrics keyed by database name. +const getMetrics = (conn) => { + const cursor = conn.getDB('admin').aggregate([{$operationMetrics: {}}]); + + let allMetrics = {}; + while (cursor.hasNext()) { + let doc = cursor.next(); + allMetrics[doc.db] = doc; + } + return allMetrics; +}; + +const assertMetrics = (conn, assertFn) => { + let metrics = getMetrics(conn); + try { + assertFn(metrics); + } catch (e) { + print("caught exception while checking metrics on " + tojson(conn) + + ", metrics: " + tojson(metrics)); + throw e; + } +}; + +// Create identical TTL indexes on both databases with immediate expiry. 
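+// With expireAfterSeconds: 0, a document becomes eligible for TTL deletion as soon as the date in
+// its indexed field is in the past.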
+assert.commandWorked(primaryDB1[collName].createIndex({x: 1}, {expireAfterSeconds: 0}));
+assert.commandWorked(primaryDB2[collName].createIndex({x: 1}, {expireAfterSeconds: 0}));
+
+const pauseTtl = configureFailPoint(primary, 'hangTTLMonitorWithLock');
+pauseTtl.wait();
+
+clearMetrics(primary);
+
+// On DB1 we expect all documents to be deleted.
+const now = new Date();
+assert.commandWorked(primaryDB1[collName].insert({_id: 0, x: now}));
+assert.commandWorked(primaryDB1[collName].insert({_id: 1, x: now}));
+
+// On DB2 we expect no documents to be deleted.
+const later = new Date(now.getTime() + 1000 * 60 * 60);
+assert.commandWorked(primaryDB2[collName].insert({_id: 0, x: later}));
+assert.commandWorked(primaryDB2[collName].insert({_id: 1, x: later}));
+
+assertMetrics(primary, (metrics) => {
+    // With replication enabled, oplog writes are counted towards bytes written, so only assert
+    // that at least as many bytes as the documents contain were written.
+    // Document size is 29 bytes.
+    assert.gte(metrics[dbName1].docBytesWritten, 29 * 2);
+    assert.gte(metrics[dbName1].docUnitsWritten, 2);
+    assert.gte(metrics[dbName1].totalUnitsWritten, 2);
+
+    assert.gte(metrics[dbName2].docBytesWritten, 29 * 2);
+    assert.gte(metrics[dbName2].docUnitsWritten, 2);
+    assert.gte(metrics[dbName2].totalUnitsWritten, 2);
+});
+
+// Clear metrics and wait for a TTL pass to delete the documents.
+clearMetrics(primary);
+pauseTtl.off();
+TTLUtil.waitForPass(primaryDB1);
+
+// Ensure that the TTL monitor deleted 2 documents on the primary and recorded read and write
+// metrics.
+assertMetrics(primary, (metrics) => {
+    // The TTL monitor generates oplog entries for each deletion on the primary, so assert that at
+    // least as many bytes as the documents contain were written. Document size is 29 bytes.
+    assert.gte(metrics[dbName1].primaryMetrics.docBytesRead, 29 * 2);
+    assert.gte(metrics[dbName1].primaryMetrics.docUnitsRead, 2);
+    assert.gte(metrics[dbName1].docBytesWritten, 29 * 2);
+    assert.gte(metrics[dbName1].docUnitsWritten, 2);
+    assert.gte(metrics[dbName1].totalUnitsWritten, 2);
+    // Key size is 12 bytes.
+    assert.gte(metrics[dbName1].primaryMetrics.idxEntryBytesRead, 12 * 2);
+    assert.gte(metrics[dbName1].primaryMetrics.idxEntryUnitsRead, 2);
+    // At least 2 keys (_id and x) should be deleted for each document.
+    assert.gte(metrics[dbName1].idxEntryUnitsWritten, 2 * 2);
+    assert.gte(metrics[dbName1].idxEntryBytesWritten, 12 * 2);
+
+    assert.eq(metrics[dbName2].primaryMetrics.docBytesRead, 0);
+    assert.eq(metrics[dbName2].primaryMetrics.docUnitsRead, 0);
+    assert.eq(metrics[dbName2].docBytesWritten, 0);
+    assert.eq(metrics[dbName2].docUnitsWritten, 0);
+    assert.eq(metrics[dbName2].totalUnitsWritten, 0);
+
+    // We need to read in a few keys to determine whether there is data to delete. Since we haven't
+    // stopped the TTL monitor, the value can be larger than expected.
+    assert.gte(metrics[dbName2].primaryMetrics.idxEntryBytesRead, 24);
+    assert.gte(metrics[dbName2].primaryMetrics.idxEntryUnitsRead, 2);
+    assert.eq(metrics[dbName2].idxEntryUnitsWritten, 0);
+    assert.eq(metrics[dbName2].idxEntryBytesWritten, 0);
+});
+
+rst.awaitReplication();
+
+// There should be no activity on the secondary.
+assertMetrics(secondary, (metrics) => {
+    assert(!metrics.hasOwnProperty(dbName1));
+    assert(!metrics.hasOwnProperty(dbName2));
+});
+
+// Ensure the correct documents were deleted.
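+// DB1's documents were inserted with an already-expired date (now), DB2's with a date one hour in
+// the future.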
+assert.eq(primaryDB1[collName].count({}), 0);
+assert.eq(primaryDB2[collName].count({}), 2);
+
+const secondaryDB1 = secondary.getDB(dbName1);
+const secondaryDB2 = secondary.getDB(dbName2);
+assert.eq(secondaryDB1[collName].count({}), 0);
+assert.eq(secondaryDB2[collName].count({}), 2);
+
+rst.stopSet();
+}());
diff --git a/src/mongo/db/stats/resource_consumption_metrics.cpp b/src/mongo/db/stats/resource_consumption_metrics.cpp
index 1fccd04fc66..bedfcdb3f2f 100644
--- a/src/mongo/db/stats/resource_consumption_metrics.cpp
+++ b/src/mongo/db/stats/resource_consumption_metrics.cpp
@@ -211,6 +211,12 @@ void ResourceConsumption::OperationMetrics::toBson(BSONObjBuilder* builder) cons
     }
 }
 
+BSONObj ResourceConsumption::OperationMetrics::toBson() const {
+    BSONObjBuilder builder;
+    toBson(&builder);
+    return builder.obj();
+}
+
 void ResourceConsumption::OperationMetrics::toBsonNonZeroFields(BSONObjBuilder* builder) const {
     appendNonZeroMetric(builder, kDocBytesRead, readMetrics.docsRead.bytes());
     appendNonZeroMetric(builder, kDocUnitsRead, readMetrics.docsRead.units());
@@ -328,6 +334,11 @@ void ResourceConsumption::MetricsCollector::beginScopedCollecting(OperationConte
     _collecting = ScopedCollectionState::kInScopeCollecting;
     _hasCollectedMetrics = true;
 
+    // We must clear the metrics here to ensure we do not accumulate metrics from previous scoped
+    // collections. Note that we can't clear metrics in endScopedCollecting() because consumers
+    // expect metrics to be available after a scoped collection period has ended.
+    _metrics = {};
+
     // The OperationCPUTimer may be nullptr on unsupported systems.
     _metrics.cpuTimer = OperationCPUTimer::get(opCtx);
     if (_metrics.cpuTimer) {
@@ -408,6 +419,12 @@ void ResourceConsumption::merge(OperationContext* opCtx,
                                 const OperationMetrics& metrics) {
     invariant(!dbName.empty());
 
+    LOGV2_DEBUG(7527700,
+                1,
+                "ResourceConsumption::merge",
+                "dbName"_attr = dbName,
+                "metrics"_attr = metrics.toBson());
+
     // All metrics over the duration of this operation will be attributed to the current state, even
     // if it ran across state transitions.
     // The RSTL is normally required to check the replication state, but callers may not always be
diff --git a/src/mongo/db/stats/resource_consumption_metrics.h b/src/mongo/db/stats/resource_consumption_metrics.h
index f41042b29ab..19b3c67c234 100644
--- a/src/mongo/db/stats/resource_consumption_metrics.h
+++ b/src/mongo/db/stats/resource_consumption_metrics.h
@@ -219,6 +219,7 @@ public:
      * Reports all metrics on a BSONObjBuilder.
      */
     void toBson(BSONObjBuilder* builder) const;
+    BSONObj toBson() const;
 
     /**
      * Reports metrics on a BSONObjBuilder. Only non-zero fields are reported.
@@ -280,7 +281,8 @@ public:
     static MetricsCollector& get(OperationContext* opCtx);
 
     /**
-     * When called, resource consumption metrics should be recorded for this operation.
+     * When called, resource consumption metrics should be recorded for this operation. Clears
+     * any metrics from previous collection periods.
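+     * Metrics remain readable after endScopedCollecting() until the next collection period begins.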
*/ void beginScopedCollecting(OperationContext* opCtx, const std::string& dbName); diff --git a/src/mongo/db/stats/resource_consumption_metrics_test.cpp b/src/mongo/db/stats/resource_consumption_metrics_test.cpp index 9c992bca916..155bb688e11 100644 --- a/src/mongo/db/stats/resource_consumption_metrics_test.cpp +++ b/src/mongo/db/stats/resource_consumption_metrics_test.cpp @@ -762,4 +762,65 @@ TEST_F(ResourceConsumptionMetricsTest, PauseMetricsCollectorBlock) { ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.cursorSeeks, 1); } +TEST_F(ResourceConsumptionMetricsTest, ResetMetricsBetweenCollection) { + auto& globalResourceConsumption = ResourceConsumption::get(getServiceContext()); + auto& operationMetrics = ResourceConsumption::MetricsCollector::get(_opCtx.get()); + + { + ResourceConsumption::ScopedMetricsCollector scope(_opCtx.get(), "db1"); + + operationMetrics.incrementOneDocRead("", 2); + operationMetrics.incrementOneIdxEntryRead("", 4); + operationMetrics.incrementKeysSorted(8); + operationMetrics.incrementSorterSpills(16); + operationMetrics.incrementDocUnitsReturned("", makeDocUnits(32)); + operationMetrics.incrementOneCursorSeek(""); + } + + auto metricsCopy = globalResourceConsumption.getAndClearDbMetrics(); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsRead.bytes(), 2); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsRead.units(), 1); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.idxEntriesRead.bytes(), 4); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.idxEntriesRead.units(), 1); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.keysSorted, 8); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.sorterSpills, 16); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsReturned.bytes(), 32); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsReturned.units(), 1); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.cursorSeeks, 1); + + // We expect this metrics collection to wipe out the metrics from the previous one. + { + ResourceConsumption::ScopedMetricsCollector scope(_opCtx.get(), "db2"); + operationMetrics.incrementOneDocRead("", 64); + operationMetrics.incrementOneIdxEntryRead("", 128); + operationMetrics.incrementKeysSorted(256); + operationMetrics.incrementSorterSpills(512); + operationMetrics.incrementDocUnitsReturned("", makeDocUnits(1024)); + operationMetrics.incrementOneCursorSeek(""); + } + + metricsCopy = globalResourceConsumption.getDbMetrics(); + + // We should not have any of the metrics from the first collection. 
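+    // getAndClearDbMetrics() wiped the db1 aggregates above, and with the clear-on-begin behavior
+    // the second scoped collection starts from zero instead of accumulating on the first.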
+ ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsRead.bytes(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsRead.units(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.idxEntriesRead.bytes(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.idxEntriesRead.units(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.keysSorted, 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.sorterSpills, 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsReturned.bytes(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.docsReturned.units(), 0); + ASSERT_EQ(metricsCopy["db1"].primaryReadMetrics.cursorSeeks, 0); + + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.docsRead.bytes(), 64); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.docsRead.units(), 1); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.idxEntriesRead.bytes(), 128); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.idxEntriesRead.units(), 8); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.keysSorted, 256); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.sorterSpills, 512); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.docsReturned.bytes(), 1024); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.docsReturned.units(), 8); + ASSERT_EQ(metricsCopy["db2"].primaryReadMetrics.cursorSeeks, 1); +} + } // namespace mongo -- cgit v1.2.1
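The contract this patch establishes, distilled: a collector's metrics are cleared when a scoped
collection period begins, never when it ends, so a finished period's metrics stay readable until
the next period starts. Below is a minimal self-contained sketch of that contract; the types and
method signatures are simplified stand-ins for illustration, not the real MetricsCollector API in
resource_consumption_metrics.h.

    #include <cassert>
    #include <string>

    // Simplified stand-ins: the real Metrics/MetricsCollector carry far more state.
    struct Metrics {
        long long docBytesRead = 0;
    };

    class MetricsCollector {
    public:
        void beginScopedCollecting(const std::string& dbName) {
            _dbName = dbName;
            // The fix: reset at the start of a period, not at the end, so a
            // finished period's metrics stay readable.
            _metrics = {};
            _collecting = true;
        }
        void endScopedCollecting() {
            _collecting = false;
        }
        void incrementOneDocRead(long long bytes) {
            if (_collecting)
                _metrics.docBytesRead += bytes;
        }
        const Metrics& metrics() const {
            return _metrics;
        }

    private:
        std::string _dbName;
        bool _collecting = false;
        Metrics _metrics;
    };

    int main() {
        MetricsCollector collector;

        collector.beginScopedCollecting("db1");
        collector.incrementOneDocRead(2);
        collector.endScopedCollecting();
        // Still readable after the period ends.
        assert(collector.metrics().docBytesRead == 2);

        // The next period must start from zero, not accumulate on top of db1's.
        collector.beginScopedCollecting("db2");
        collector.incrementOneDocRead(64);
        collector.endScopedCollecting();
        assert(collector.metrics().docBytesRead == 64);
        return 0;
    }

The byte counts 2 and 64 mirror the values exercised by the new ResetMetricsBetweenCollection
unit test above.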