summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBilly Donahue <billy.donahue@mongodb.com>2021-02-03 01:36:11 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-02-09 04:43:08 +0000
commitc6afa9e7eb6c5f9ea7ce19c86b1658d055ec3872 (patch)
treee3541a8cfc8eb82d49a4defbccbf91af1292240b
parentdc96088c84c5411b70caa0c01be000da1a1ae3d1 (diff)
downloadmongo-c6afa9e7eb6c5f9ea7ce19c86b1658d055ec3872.tar.gz
SERVER-54227 ReshardingMetrics fix "remaining time" estimate
-rw-r--r--src/mongo/db/s/resharding/resharding_metrics.cpp46
-rw-r--r--src/mongo/db/s/resharding/resharding_metrics_test.cpp78
2 files changed, 105 insertions, 19 deletions
diff --git a/src/mongo/db/s/resharding/resharding_metrics.cpp b/src/mongo/db/s/resharding/resharding_metrics.cpp
index 5fabcb00e9e..9f8ada073ca 100644
--- a/src/mongo/db/s/resharding/resharding_metrics.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics.cpp
@@ -67,6 +67,20 @@ const auto getMetrics = ServiceContext::declareDecoration<MetricsPtr>();
const auto reshardingMetricsRegisterer = ServiceContext::ConstructorActionRegisterer{
"ReshardingMetrics",
[](ServiceContext* ctx) { getMetrics(ctx) = std::make_unique<ReshardingMetrics>(ctx); }};
+
+/**
+ * Given a constant rate of time per unit of work:
+ * totalTime / totalWork == elapsedTime / elapsedWork
+ * Solve for remaining time.
+ * remainingTime := totalTime - elapsedTime
+ * == (totalWork * (elapsedTime / elapsedWork)) - elapsedTime
+ * == elapsedTime * (totalWork / elapsedWork - 1)
+ */
+Milliseconds remainingTime(Milliseconds elapsedTime, double elapsedWork, double totalWork) {
+ elapsedWork = std::min(elapsedWork, totalWork);
+ double remainingMsec = 1.0 * elapsedTime.count() * (totalWork / elapsedWork - 1);
+ return Milliseconds(Milliseconds::rep(remainingMsec));
+}
} // namespace
ReshardingMetrics* ReshardingMetrics::get(ServiceContext* ctx) noexcept {
@@ -242,28 +256,28 @@ void ReshardingMetrics::OperationMetrics::append(BSONObjBuilder* bob, Role role)
return durationCount<Seconds>(interval.duration());
};
- auto estimateRemainingOperationTime = [&]() -> int64_t {
- if (bytesCopied == 0 && oplogEntriesApplied == 0)
- return -1;
- else if (oplogEntriesApplied == 0) {
- invariant(bytesCopied > 0);
+ auto remainingMsec = [&]() -> boost::optional<Milliseconds> {
+ if (oplogEntriesApplied > 0) {
+ // All fetched oplogEntries must be applied. Some of them already have been.
+ return remainingTime(
+ applyingOplogEntries.duration(), oplogEntriesApplied, oplogEntriesFetched);
+ }
+ if (bytesCopied > 0) {
// Until the time to apply batches of oplog entries is measured, we assume that applying
// all of them will take as long as copying did.
- const auto elapsedCopyTime = getElapsedTime(copyingDocuments);
- const auto approxTimeToCopy =
- elapsedCopyTime * std::max((int64_t)0, bytesToCopy / bytesCopied - 1);
- return elapsedCopyTime + 2 * approxTimeToCopy;
- } else {
- invariant(oplogEntriesApplied > 0);
- const auto approxTimeToApply = getElapsedTime(applyingOplogEntries) *
- std::max((int64_t)0, oplogEntriesFetched / oplogEntriesApplied - 1);
- return approxTimeToApply;
+ return remainingTime(copyingDocuments.duration(), bytesCopied, 2 * bytesToCopy);
}
- };
+ return {};
+ }();
+
const std::string kIntervalSuffix = role == Role::kAll ? "Millis" : "";
bob->append(kOpTimeElapsed + kIntervalSuffix, getElapsedTime(runningOperation));
- bob->append(kOpTimeRemaining + kIntervalSuffix, estimateRemainingOperationTime());
+
+ bob->append(kOpTimeRemaining + kIntervalSuffix,
+ !remainingMsec ? int64_t{-1} /** -1 is a specified integer null value */
+ : role == Role::kAll ? durationCount<Milliseconds>(*remainingMsec)
+ : durationCount<Seconds>(*remainingMsec));
if (role == Role::kAll || role == Role::kRecipient) {
bob->append(kDocumentsToCopy, documentsToCopy);
diff --git a/src/mongo/db/s/resharding/resharding_metrics_test.cpp b/src/mongo/db/s/resharding/resharding_metrics_test.cpp
index 798e7c2a103..b4e65ea0f31 100644
--- a/src/mongo/db/s/resharding/resharding_metrics_test.cpp
+++ b/src/mongo/db/s/resharding/resharding_metrics_test.cpp
@@ -27,22 +27,30 @@
* it in the license file.
*/
+#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest
+
#include <fmt/format.h>
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/bson/json.h"
#include "mongo/db/s/resharding/resharding_metrics.h"
#include "mongo/db/service_context_test_fixture.h"
+#include "mongo/logv2/log.h"
#include "mongo/unittest/death_test.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/clock_source_mock.h"
#include "mongo/util/uuid.h"
namespace mongo {
+namespace {
+
+using namespace fmt::literals;
+
+constexpr auto kOpTimeRemaining = "remainingOperationTimeEstimatedMillis"_sd;
class ReshardingMetricsTest : public ServiceContextTest {
public:
- void setUp() {
+ void setUp() override {
auto clockSource = std::make_unique<ClockSourceMock>();
_clockSource = clockSource.get();
getGlobalServiceContext()->setFastClockSource(std::move(clockSource));
@@ -55,8 +63,8 @@ public:
// Timer step in milliseconds
static constexpr auto kTimerStep = 100;
- void advanceTime(Milliseconds interval = Milliseconds(kTimerStep)) {
- _clockSource->advance(interval);
+ void advanceTime(Milliseconds step = Milliseconds{kTimerStep}) {
+ _clockSource->advance(step);
}
auto getReport() {
@@ -371,4 +379,68 @@ TEST_F(ReshardingMetricsTest, CurrentOpReportForCoordinator) {
ASSERT_BSONOBJ_EQ(expected, report);
}
+TEST_F(ReshardingMetricsTest, EstimatedRemainingOperationTimeCloning) {
+ // Copy N docs @ timePerDoc. Check the progression of the estimated time remaining.
+ auto m = getMetrics();
+ m->onStart();
+ m->setRecipientState(RecipientStateEnum::kCloning);
+ auto timePerDocument = Milliseconds{123};
+ int64_t bytesPerDocument = 1024;
+ int64_t documentsToCopy = 409;
+ int64_t bytesToCopy = bytesPerDocument * documentsToCopy;
+ m->setDocumentsToCopy(documentsToCopy, bytesToCopy);
+ auto remainingTime = 2 * timePerDocument * documentsToCopy;
+ double maxAbsRelErr = 0;
+ for (int64_t copied = 0; copied < documentsToCopy; ++copied) {
+ double output = getReport()[kOpTimeRemaining].Number();
+ if (copied == 0) {
+ ASSERT_EQ(output, -1);
+ } else {
+ ASSERT_GTE(output, 0);
+ auto expected = durationCount<Milliseconds>(remainingTime);
+ // Check that error is pretty small (it should get better as the operation progresses)
+ double absRelErr = std::abs((output - expected) / expected);
+ ASSERT_LT(absRelErr, 0.05)
+ << "output={}, expected={}, copied={}"_format(output, expected, copied);
+ maxAbsRelErr = std::max(maxAbsRelErr, absRelErr);
+ }
+ m->onDocumentsCopied(1, bytesPerDocument);
+ advanceTime(timePerDocument);
+ remainingTime -= timePerDocument;
+ }
+ LOGV2_DEBUG(
+ 5422700, 3, "Max absolute relative error observed", "maxAbsRelErr"_attr = maxAbsRelErr);
+}
+
+TEST_F(ReshardingMetricsTest, EstimatedRemainingOperationTimeApplying) {
+ // Perform N ops @ timePerOp. Check the progression of the estimated time remaining.
+ auto m = getMetrics();
+ m->onStart();
+ m->setRecipientState(RecipientStateEnum::kApplying);
+ auto timePerOp = Milliseconds{123};
+ int64_t fetched = 10000;
+ m->onOplogEntriesFetched(fetched);
+ auto remainingTime = timePerOp * fetched;
+ double maxAbsRelErr = 0;
+ for (int64_t applied = 0; applied < fetched; ++applied) {
+ double output = getReport()[kOpTimeRemaining].Number();
+ if (applied == 0) {
+ ASSERT_EQ(output, -1);
+ } else {
+ auto expected = durationCount<Milliseconds>(remainingTime);
+ // Check that error is pretty small (it should get better as the operation progresses)
+ double absRelErr = std::abs((output - expected) / expected);
+ ASSERT_LT(absRelErr, 0.05)
+ << "output={}, expected={}, applied={}"_format(output, expected, applied);
+ maxAbsRelErr = std::max(maxAbsRelErr, absRelErr);
+ }
+ advanceTime(timePerOp);
+ m->onOplogEntriesApplied(1);
+ remainingTime -= timePerOp;
+ }
+ LOGV2_DEBUG(
+ 5422701, 3, "Max absolute relative error observed", "maxAbsRelErr"_attr = maxAbsRelErr);
+}
+
+} // namespace
} // namespace mongo