SERVER-57700 Measure latency/throughput of resharding::data_copy::fillBatchForInsert in ReshardingCollectionCloner::doOneBatch

author: Matt Walak <matt.walak@mongodb.com> 2021-08-02 19:15:01 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-08-12 15:41:36 +0000
commit: 98a21127ae4957594276a35228a95d8d79d6ad25 (patch)
tree: b8bc46e2045a9d81ccb82f377bd571abac65bdea /src/mongo/db/s
parent: fce8274e6d47ed28d915e8e2db5149876dbcae25 (diff)
download: mongo-98a21127ae4957594276a35228a95d8d79d6ad25.tar.gz
5 files changed, 116 insertions, 49 deletions
diff --git a/src/mongo/db/s/resharding/resharding_collection_cloner.cpp b/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
index 2af6caace49..7971ca21424 100644
--- a/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
+++ b/src/mongo/db/s/resharding/resharding_collection_cloner.cpp
@@ -258,8 +258,12 @@ std::unique_ptr<Pipeline, PipelineDeleter> ReshardingCollectionCloner::_restartP
 bool ReshardingCollectionCloner::doOneBatch(OperationContext* opCtx, Pipeline& pipeline) {
     pipeline.reattachToOperationContext(opCtx);
     ON_BLOCK_EXIT([&pipeline] { pipeline.detachFromOperationContext(); });
+
+    Timer latencyTimer;
     auto batch = resharding::data_copy::fillBatchForInsert(
         pipeline, resharding::gReshardingCollectionClonerBatchSizeInBytes.load());
+    _env->metrics()->onCollClonerFillBatchForInsert(
+        duration_cast<Milliseconds>(latencyTimer.elapsed()));
 
     if (batch.empty()) {
         return false;
diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp
index 9cc77c64a1a..f95372e3ad2 100644
--- a/src/mongo/db/s/resharding/resharding_metrics.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics.cpp
@@ -69,7 +69,9 @@ constexpr auto kLastOpEndingChunkImbalance = "lastOpEndingChunkImbalance";
 constexpr auto kOpCounters = "opcounters";
 constexpr auto kMinRemainingOperationTime = "minShardRemainingOperationTimeEstimatedMillis";
 constexpr auto kMaxRemainingOperationTime = "maxShardRemainingOperationTimeEstimatedMillis";
-constexpr auto kOplogBatchApplyLatencyMillis = "oplogBatchApplyLatencyMillis";
+constexpr auto kOplogApplierApplyBatchLatencyMillis = "oplogApplierApplyBatchLatencyMillis";
+constexpr auto kCollClonerFillBatchForInsertLatencyMillis =
+    "collClonerFillBatchForInsertLatencyMillis";
 
 using MetricsPtr = std::unique_ptr<ReshardingMetrics>;
 
@@ -93,6 +95,13 @@ Milliseconds remainingTime(Milliseconds elapsedTime, double elapsedWork, double
     return Milliseconds(Milliseconds::rep(remainingMsec));
 }
 
+void appendHistogram(BSONObjBuilder* bob,
+                     const IntegerHistogram<kLatencyHistogramBucketsCount>& hist) {
+    BSONObjBuilder histogramBuilder;
+    hist.append(histogramBuilder, false);
+    bob->appendElements(histogramBuilder.obj());
+}
+
 static StringData serializeState(boost::optional<RecipientStateEnum> e) {
     return RecipientState_serializer(*e);
 }
@@ -220,8 +229,11 @@ public:
 
     int64_t chunkImbalanceCount = 0;
 
-    IntegerHistogram<kLatencyHistogramBucketsCount> oplogBatchApplyLatencyMillis =
-        IntegerHistogram<kLatencyHistogramBucketsCount>(kOplogBatchApplyLatencyMillis,
+    IntegerHistogram<kLatencyHistogramBucketsCount> oplogApplierApplyBatchLatencyMillis =
+        IntegerHistogram<kLatencyHistogramBucketsCount>(kOplogApplierApplyBatchLatencyMillis,
+                                                        latencyHistogramBuckets);
+    IntegerHistogram<kLatencyHistogramBucketsCount> collClonerFillBatchForInsertLatencyMillis =
+        IntegerHistogram<kLatencyHistogramBucketsCount>(kCollClonerFillBatchForInsertLatencyMillis,
                                                         latencyHistogramBuckets);
 
     // The ops done by resharding to keep up with the client writes.
@@ -306,13 +318,8 @@ void ReshardingMetrics::OperationMetrics::appendCurrentOpMetrics(BSONObjBuilder*
                         serializeState(recipientState.get_value_or(RecipientStateEnum::kUnused)));
             bob->append(kOpStatus, ReshardingOperationStatus_serializer(opStatus));
 
-            {
-                BSONObjBuilder histogramBuilder;
-                oplogBatchApplyLatencyMillis.append(histogramBuilder, false);
-                BSONObj histogram = histogramBuilder.obj();
-                bob->appendElements(histogram);
-            }
-
+            appendHistogram(bob, oplogApplierApplyBatchLatencyMillis);
+            appendHistogram(bob, collClonerFillBatchForInsertLatencyMillis);
             break;
         case Role::kCoordinator:
             bob->append(kCoordinatorState, serializeState(coordinatorState));
@@ -573,14 +580,27 @@ void ReshardingMetrics::gotInserts(int n) noexcept {
     _cumulativeOp->gotInserts(n);
 }
 
-void ReshardingMetrics::onApplyOplogBatch(Milliseconds latency) {
+void ReshardingMetrics::onOplogApplierApplyBatch(Milliseconds latency) {
     stdx::lock_guard<Latch> lk(_mutex);
     invariant(_currentOp, kNoOperationInProgress);
     invariant(checkState(*_currentOp->recipientState,
                          {RecipientStateEnum::kApplying, RecipientStateEnum::kError}));
 
-    _currentOp->oplogBatchApplyLatencyMillis.increment(durationCount<Milliseconds>(latency));
-    _cumulativeOp->oplogBatchApplyLatencyMillis.increment(durationCount<Milliseconds>(latency));
+    _currentOp->oplogApplierApplyBatchLatencyMillis.increment(durationCount<Milliseconds>(latency));
+    _cumulativeOp->oplogApplierApplyBatchLatencyMillis.increment(
+        durationCount<Milliseconds>(latency));
+}
+
+void ReshardingMetrics::onCollClonerFillBatchForInsert(Milliseconds latency) {
+    stdx::lock_guard<Latch> lk(_mutex);
+    invariant(_currentOp, kNoOperationInProgress);
+    invariant(checkState(*_currentOp->recipientState,
+                         {RecipientStateEnum::kCloning, RecipientStateEnum::kError}));
+
+    _currentOp->collClonerFillBatchForInsertLatencyMillis.increment(
+        durationCount<Milliseconds>(latency));
+    _cumulativeOp->collClonerFillBatchForInsertLatencyMillis.increment(
+        durationCount<Milliseconds>(latency));
 }
 
 void ReshardingMetrics::gotInsert() noexcept {
@@ -761,12 +781,8 @@ void ReshardingMetrics::serializeCumulativeOpMetrics(BSONObjBuilder* bob) const
     bob->append(kMaxRemainingOperationTime,
                 getRemainingOperationTime(ops.maxRemainingOperationTime));
 
-    {
-        BSONObjBuilder histogramBuilder;
-        ops.oplogBatchApplyLatencyMillis.append(histogramBuilder, false);
-        BSONObj histogram = histogramBuilder.obj();
-        bob->appendElements(histogram);
-    }
+    appendHistogram(bob, ops.oplogApplierApplyBatchLatencyMillis);
+    appendHistogram(bob, ops.collClonerFillBatchForInsertLatencyMillis);
 }
 
 Date_t ReshardingMetrics::_now() const {
diff --git a/src/mongo/db/s/resharding/resharding_metrics.h b/src/mongo/db/s/resharding/resharding_metrics.h
index b63ef01bb9b..24ab1a9a38a 100644
--- a/src/mongo/db/s/resharding/resharding_metrics.h
+++ b/src/mongo/db/s/resharding/resharding_metrics.h
@@ -106,8 +106,12 @@ public:
     void enterCriticalSection(Date_t start);
     void leaveCriticalSection(Date_t end);
 
-    // Records latency and throughput of batch oplog applies during resharding.
-    void onApplyOplogBatch(Milliseconds latency);
+    // Records latency and throughput of calls to ReshardingOplogApplier::_applyBatch
+    void onOplogApplierApplyBatch(Milliseconds latency);
+
+    // Records latency and throughput of calls to resharding::data_copy::fillBatchForInsert
+    // in ReshardingCollectionCloner::doOneBatch
+    void onCollClonerFillBatchForInsert(Milliseconds latency);
 
     // Allows updating "oplog entries to apply" metrics when the recipient is in applying state.
     void onOplogEntriesFetched(int64_t entries) noexcept;
diff --git a/src/mongo/db/s/resharding/resharding_metrics_test.cpp b/src/mongo/db/s/resharding/resharding_metrics_test.cpp
index 77b8be78fb2..4b979936f51 100644
--- a/src/mongo/db/s/resharding/resharding_metrics_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics_test.cpp
@@ -49,7 +49,9 @@ namespace {
 using namespace fmt::literals;
 
 constexpr auto kOpTimeRemaining = "remainingOperationTimeEstimatedSecs"_sd;
-constexpr auto kOplogBatchApplyLatencyMillis = "oplogBatchApplyLatencyMillis";
+constexpr auto kOplogApplierApplyBatchLatencyMillis = "oplogApplierApplyBatchLatencyMillis";
+constexpr auto kCollClonerFillBatchForInsertLatencyMillis =
+    "collClonerFillBatchForInsertLatencyMillis";
 
 class ReshardingMetricsTest : public ServiceContextTest {
 public:
@@ -129,15 +131,18 @@ public:
             << fmt::format("{}: {}", errMsg, report.toString());
     };
 
-    void appendExpectedHistogramResult(BSONObjBuilder& builder,
+    void appendExpectedHistogramResult(BSONObjBuilder* bob,
+                                       std::string tag,
                                        const std::vector<int64_t>& latencies) {
-        IntegerHistogram<kLatencyHistogramBucketsCount> hist(kOplogBatchApplyLatencyMillis,
-                                                             latencyHistogramBuckets);
+        IntegerHistogram<kLatencyHistogramBucketsCount> hist(tag, latencyHistogramBuckets);
         for (size_t i = 0; i < latencies.size(); i++) {
             hist.increment(latencies[i]);
         }
 
-        hist.append(builder, false);
+        BSONObjBuilder histogramBuilder;
+        hist.append(histogramBuilder, false);
+        BSONObj histogram = histogramBuilder.obj();
+        bob->appendElements(histogram);
     }
 
 private:
@@ -589,21 +594,34 @@ TEST_F(ReshardingMetricsTest, CurrentOpReportForRecipient) {
 
     BSONObjBuilder expectedBuilder(std::move(expectedPrefix));
 
-    // Append histogram data for batch oplog applies.
-    appendExpectedHistogramResult(expectedBuilder, {});
-    BSONObj expected = expectedBuilder.obj();
+    // Append histogram latency data.
+    appendExpectedHistogramResult(&expectedBuilder, kOplogApplierApplyBatchLatencyMillis, {});
+    appendExpectedHistogramResult(&expectedBuilder, kCollClonerFillBatchForInsertLatencyMillis, {});
+
+    BSONObj expected = expectedBuilder.done();
 
     const auto report = getMetrics()->reportForCurrentOp(options);
     ASSERT_BSONOBJ_EQ(expected, report);
 }
 
-TEST_F(ReshardingMetricsTest, TestOplogBatchLatencyHistogramForRecipient) {
-    const std::vector<int64_t> latencies_1{3, 427, 0, 6004, 320, 10056, 12300, 105, 70};
-    const std::vector<int64_t> latencies_2{800, 20, 5, 1025, 10567};
+TEST_F(ReshardingMetricsTest, TestHistogramMetricsForRecipient) {
+    const std::vector<int64_t> applyLatencies_1{3, 427, 0, 6004, 320, 10056, 12300, 105, 70};
+    const std::vector<int64_t> applyLatencies_2{800, 20, 5, 1025, 10567};
+    const std::vector<int64_t> insertLatencies_1{120, 7, 110, 50, 0, 16500, 77000, 667, 7980};
+    const std::vector<int64_t> insertLatencies_2{12450, 2400, 760, 57, 2};
+
+    const auto combineLatencies = [](std::vector<int64_t>* allLatencies,
+                                     const std::vector<int64_t>& latencies_1,
+                                     const std::vector<int64_t>& latencies_2) {
+        allLatencies->insert(allLatencies->end(), latencies_1.begin(), latencies_1.end());
+        allLatencies->insert(allLatencies->end(), latencies_2.begin(), latencies_2.end());
+    };
+
+    std::vector<int64_t> allApplyLatencies;
+    combineLatencies(&allApplyLatencies, applyLatencies_1, applyLatencies_2);
+    std::vector<int64_t> allInsertLatencies;
+    combineLatencies(&allInsertLatencies, insertLatencies_1, insertLatencies_2);
 
-    std::vector<int64_t> allLatencies;
-    allLatencies.insert(allLatencies.end(), latencies_1.begin(), latencies_1.end());
-    allLatencies.insert(allLatencies.end(), latencies_2.begin(), latencies_2.end());
 
     const ReshardingMetrics::ReporterOptions options(
         ReshardingMetrics::Role::kRecipient,
@@ -612,31 +630,54 @@ TEST_F(ReshardingMetricsTest, TestOplogBatchLatencyHistogramForRecipient) {
         BSON("id" << 1),
         false);
 
-    // First test that histogram metrics appear in currentOp. Next, test that histogram metrics
-    // accumulate in cumulativeOp.
-    const size_t kNumTests = 2;
-    std::vector<int64_t> testLatencies[kNumTests] = {latencies_1, latencies_2};
-    std::vector<int64_t> expectedLatencies[kNumTests] = {latencies_1, allLatencies};
+    // Test that all histogram metrics appear in both currentOp and cumulativeOp.
+    const size_t kNumTests = 4;
+    std::vector<int64_t> testLatencies[kNumTests] = {
+        applyLatencies_1, applyLatencies_2, insertLatencies_1, insertLatencies_2};
+    std::vector<int64_t> expectedLatencies[kNumTests] = {
+        applyLatencies_1, allApplyLatencies, insertLatencies_1, allInsertLatencies};
     OpReportType testReportTypes[kNumTests] = {OpReportType::CurrentOpReportRecipientRole,
+                                               OpReportType::CumulativeReport,
+                                               OpReportType::CurrentOpReportRecipientRole,
                                                OpReportType::CumulativeReport};
+    std::string histogramTag[kNumTests] = {kOplogApplierApplyBatchLatencyMillis,
+                                           kOplogApplierApplyBatchLatencyMillis,
+                                           kCollClonerFillBatchForInsertLatencyMillis,
+                                           kCollClonerFillBatchForInsertLatencyMillis};
 
     auto testLatencyHistogram = [&](std::vector<int64_t> latencies,
                                     OpReportType reportType,
-                                    std::vector<int64_t> expectedLatencies) {
+                                    std::vector<int64_t> expectedLatencies,
+                                    std::string histogramTag) {
+        LOGV2(57700,
+              "TestHistogramMetricsForRecipient test case",
+              "reportType"_attr = reportType,
+              "histogramTag"_attr = histogramTag);
+
         startOperation(ReshardingMetrics::Role::kRecipient);
 
-        getMetrics()->setRecipientState(RecipientStateEnum::kApplying);
+        RecipientStateEnum state = (histogramTag.compare(kOplogApplierApplyBatchLatencyMillis) == 0
+                                        ? RecipientStateEnum::kApplying
+                                        : RecipientStateEnum::kCloning);
+        getMetrics()->setRecipientState(state);
+
         for (size_t i = 0; i < latencies.size(); i++) {
-            getMetrics()->onApplyOplogBatch(Milliseconds(latencies[i]));
+            if (histogramTag.compare(kOplogApplierApplyBatchLatencyMillis) == 0) {
+                getMetrics()->onOplogApplierApplyBatch(Milliseconds(latencies[i]));
+            } else if (histogramTag.compare(kCollClonerFillBatchForInsertLatencyMillis) == 0) {
+                getMetrics()->onCollClonerFillBatchForInsert(Milliseconds(latencies[i]));
+            } else {
+                MONGO_UNREACHABLE;
+            }
         }
 
         const auto report = getReport(reportType);
-        const auto buckets = report[kOplogBatchApplyLatencyMillis];
+        const auto buckets = report[histogramTag];
 
         BSONObjBuilder expectedBuilder;
-        appendExpectedHistogramResult(expectedBuilder, expectedLatencies);
-        const auto expectedHist = expectedBuilder.obj();
-        const auto expectedBuckets = expectedHist[kOplogBatchApplyLatencyMillis];
+        appendExpectedHistogramResult(&expectedBuilder, histogramTag, expectedLatencies);
+        const auto expectedHist = expectedBuilder.done();
+        const auto expectedBuckets = expectedHist[histogramTag];
 
         ASSERT_EQ(buckets.woCompare(expectedBuckets), 0);
 
@@ -645,7 +686,8 @@ TEST_F(ReshardingMetricsTest, TestOplogBatchLatencyHistogramForRecipient) {
     };
 
     for (size_t i = 0; i < kNumTests; i++) {
-        testLatencyHistogram(testLatencies[i], testReportTypes[i], expectedLatencies[i]);
+        testLatencyHistogram(
+            testLatencies[i], testReportTypes[i], expectedLatencies[i], histogramTag[i]);
     }
 }
 
diff --git a/src/mongo/db/s/resharding/resharding_oplog_applier.cpp b/src/mongo/db/s/resharding/resharding_oplog_applier.cpp
index 06ce65384c8..ad2f2e399ad 100644
--- a/src/mongo/db/s/resharding/resharding_oplog_applier.cpp
+++ b/src/mongo/db/s/resharding/resharding_oplog_applier.cpp
@@ -116,7 +116,8 @@ SemiFuture<void> ReshardingOplogApplier::_applyBatch(
             return status;
         })
         .onCompletion([this, latencyTimer](Status status) {
-            _env->metrics()->onApplyOplogBatch(duration_cast<Milliseconds>(latencyTimer.elapsed()));
+            _env->metrics()->onOplogApplierApplyBatch(
+                duration_cast<Milliseconds>(latencyTimer.elapsed()));
             return status;
         })
         .semi();
author	Matt Walak <matt.walak@mongodb.com>	2021-08-02 19:15:01 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2021-08-12 15:41:36 +0000
commit	98a21127ae4957594276a35228a95d8d79d6ad25 (patch)
tree	b8bc46e2045a9d81ccb82f377bd571abac65bdea /src/mongo/db/s
parent	fce8274e6d47ed28d915e8e2db5149876dbcae25 (diff)
download	mongo-98a21127ae4957594276a35228a95d8d79d6ad25.tar.gz