summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Russotto <matthew.russotto@mongodb.com>2020-01-21 21:27:42 +0000
committerevergreen <evergreen@mongodb.com>2020-01-21 21:27:42 +0000
commitfd317b4f262e66624975e42f77b7c9bddc47fbbe (patch)
tree4a27505c755581825576312944884d6bb4ef8b40
parent5d5ab0e0d20f65872f6e006835593bfb75f63905 (diff)
downloadmongo-fd317b4f262e66624975e42f77b7c9bddc47fbbe.tar.gz
SERVER-43280 Implement new initial sync statistics
-rw-r--r--src/mongo/db/repl/initial_sync_shared_data.h8
-rw-r--r--src/mongo/db/repl/initial_syncer.cpp34
-rw-r--r--src/mongo/db/repl/initial_syncer.h3
-rw-r--r--src/mongo/db/repl/initial_syncer_test.cpp161
4 files changed, 197 insertions, 9 deletions
diff --git a/src/mongo/db/repl/initial_sync_shared_data.h b/src/mongo/db/repl/initial_sync_shared_data.h
index 2324d1864a5..ffe2aa3a948 100644
--- a/src/mongo/db/repl/initial_sync_shared_data.h
+++ b/src/mongo/db/repl/initial_sync_shared_data.h
@@ -115,6 +115,14 @@ public:
}
/**
+ * Returns the time the current outage (if any) began. Returns Date_t() if no outage in
+ * progress.
+ */
+ Date_t getSyncSourceUnreachableSince(WithLock lk) {
+ return _syncSourceUnreachableSince;
+ }
+
+ /**
* shouldRetryOperation() is the interface for retries. For each retryable operation, declare a
* RetryableOperation which is passed to this method. When the operation succeeds, destroy the
* RetryableOperation (outside the lock) or assign boost::none to it.
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index 14f624c99b7..08c54f5738f 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -392,6 +392,17 @@ void InitialSyncer::_appendInitialSyncProgressMinimal_inlock(BSONObjBuilder* bob
if (!_initialSyncState->stopTimestamp.isNull()) {
bob->append("initialSyncOplogEnd", _initialSyncState->stopTimestamp);
}
+ if (_sharedData) {
+ stdx::lock_guard<InitialSyncSharedData> sdLock(*_sharedData);
+ auto unreachableSince = _sharedData->getSyncSourceUnreachableSince(sdLock);
+ if (unreachableSince != Date_t()) {
+ bob->append("syncSourceUnreachableSince", unreachableSince);
+ bob->append("currentOutageDurationMillis",
+ durationCount<Milliseconds>(_sharedData->getCurrentOutageDuration(sdLock)));
+ }
+ bob->append("totalTimeUnreachableMillis",
+ durationCount<Milliseconds>(_sharedData->getTotalTimeUnreachable(sdLock)));
+ }
}
BSONObj InitialSyncer::_getInitialSyncProgress_inlock() const {
@@ -1464,8 +1475,24 @@ void InitialSyncer::_finishInitialSyncAttempt(const StatusWith<OpTimeAndWallTime
log() << "Initial Sync Attempt Statistics: " << redact(_getInitialSyncProgress_inlock());
auto runTime = _initialSyncState ? _initialSyncState->timer.millis() : 0;
+ int rollBackId = -1;
+ int operationsRetried = 0;
+ int totalTimeUnreachableMillis = 0;
+ if (_sharedData) {
+ stdx::lock_guard<InitialSyncSharedData> sdLock(*_sharedData);
+ rollBackId = _sharedData->getRollBackId();
+ operationsRetried = _sharedData->getTotalRetries(sdLock);
+ totalTimeUnreachableMillis =
+ durationCount<Milliseconds>(_sharedData->getTotalTimeUnreachable(sdLock));
+ }
+
_stats.initialSyncAttemptInfos.emplace_back(
- InitialSyncer::InitialSyncAttemptInfo{runTime, result.getStatus(), _syncSource});
+ InitialSyncer::InitialSyncAttemptInfo{runTime,
+ result.getStatus(),
+ _syncSource,
+ rollBackId,
+ operationsRetried,
+ totalTimeUnreachableMillis});
if (MONGO_unlikely(failAndHangInitialSync.shouldFail())) {
log() << "failAndHangInitialSync fail point enabled.";
@@ -1883,6 +1910,11 @@ void InitialSyncer::InitialSyncAttemptInfo::append(BSONObjBuilder* builder) cons
builder->appendNumber("durationMillis", durationMillis);
builder->append("status", status.toString());
builder->append("syncSource", syncSource.toString());
+ if (rollBackId >= 0) {
+ builder->append("rollBackId", rollBackId);
+ }
+ builder->append("operationsRetried", operationsRetried);
+ builder->append("totalTimeUnreachableMillis", totalTimeUnreachableMillis);
}
bool InitialSyncer::OplogFetcherRestartDecisionInitialSyncer::shouldContinue(
diff --git a/src/mongo/db/repl/initial_syncer.h b/src/mongo/db/repl/initial_syncer.h
index d7f13f0ee16..399fdd57c54 100644
--- a/src/mongo/db/repl/initial_syncer.h
+++ b/src/mongo/db/repl/initial_syncer.h
@@ -156,6 +156,9 @@ public:
int durationMillis;
Status status;
HostAndPort syncSource;
+ int rollBackId;
+ int operationsRetried;
+ int totalTimeUnreachableMillis;
std::string toString() const;
BSONObj toBSON() const;
diff --git a/src/mongo/db/repl/initial_syncer_test.cpp b/src/mongo/db/repl/initial_syncer_test.cpp
index 18fc37d9dd9..494b9c5aba3 100644
--- a/src/mongo/db/repl/initial_syncer_test.cpp
+++ b/src/mongo/db/repl/initial_syncer_test.cpp
@@ -4597,7 +4597,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
auto progress = initialSyncer->getInitialSyncProgress();
log() << "Progress after first failed response: " << progress;
- ASSERT_EQUALS(progress.nFields(), 7) << progress;
+ ASSERT_EQUALS(progress.nFields(), 8) << progress;
ASSERT_EQUALS(progress.getIntField("failedInitialSyncAttempts"), 0) << progress;
ASSERT_EQUALS(progress.getIntField("maxFailedInitialSyncAttempts"), 2) << progress;
ASSERT_EQUALS(progress["initialSyncStart"].type(), Date) << progress;
@@ -4605,6 +4605,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_BSONOBJ_EQ(progress.getObjectField("initialSyncAttempts"), BSONObj());
ASSERT_EQUALS(progress.getIntField("appliedOps"), 0) << progress;
ASSERT_BSONOBJ_EQ(progress.getObjectField("databases"), BSON("databasesCloned" << 0));
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 0);
// Inject the listDatabases failure.
_mockServer->setCommandReply(
@@ -4652,18 +4653,19 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
auto progress = initialSyncer->getInitialSyncProgress();
log() << "Progress after failure: " << progress;
- ASSERT_EQUALS(progress.nFields(), 7) << progress;
+ ASSERT_EQUALS(progress.nFields(), 8) << progress;
ASSERT_EQUALS(progress.getIntField("failedInitialSyncAttempts"), 1) << progress;
ASSERT_EQUALS(progress.getIntField("maxFailedInitialSyncAttempts"), 2) << progress;
ASSERT_EQUALS(progress["initialSyncStart"].type(), Date) << progress;
ASSERT_EQUALS(progress["initialSyncOplogStart"].timestamp(), Timestamp(1, 1)) << progress;
ASSERT_EQUALS(progress.getIntField("appliedOps"), 0) << progress;
ASSERT_BSONOBJ_EQ(progress.getObjectField("databases"), BSON("databasesCloned" << 0));
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 0);
BSONObj attempts = progress["initialSyncAttempts"].Obj();
ASSERT_EQUALS(attempts.nFields(), 1) << attempts;
BSONObj attempt0 = attempts["0"].Obj();
- ASSERT_EQUALS(attempt0.nFields(), 3) << attempt0;
+ ASSERT_EQUALS(attempt0.nFields(), 6) << attempt0;
const std::string expectedlistDatabaseFailure =
"FailedToParse: error cloning databases :: caused by :: Command 'listDatabases' "
"failed.";
@@ -4674,6 +4676,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_EQUALS(attempt0["durationMillis"].type(), NumberInt) << attempt0;
ASSERT_EQUALS(attempt0.getStringField("syncSource"), std::string("localhost:27017"))
<< attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("rollBackId"), 1) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("operationsRetried"), 0) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("totalTimeUnreachableMillis"), 0) << attempt0;
// Set up the successful cloner run.
// listDatabases: a
@@ -4753,12 +4758,13 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
auto progress = initialSyncer->getInitialSyncProgress();
log() << "Progress after all but last successful response: " << progress;
- ASSERT_EQUALS(progress.nFields(), 8) << progress;
+ ASSERT_EQUALS(progress.nFields(), 9) << progress;
ASSERT_EQUALS(progress.getIntField("failedInitialSyncAttempts"), 1) << progress;
ASSERT_EQUALS(progress.getIntField("maxFailedInitialSyncAttempts"), 2) << progress;
ASSERT_EQUALS(progress["initialSyncOplogStart"].timestamp(), Timestamp(1, 1)) << progress;
ASSERT_EQUALS(progress["initialSyncOplogEnd"].timestamp(), Timestamp(7, 1)) << progress;
ASSERT_EQUALS(progress["initialSyncStart"].type(), Date) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 0) << progress;
// Expected applied ops to be a superset of this range: Timestamp(2,1) ... Timestamp(7,1).
ASSERT_GREATER_THAN_OR_EQUALS(progress.getIntField("appliedOps"), 6) << progress;
auto databasesProgress = progress.getObjectField("databases");
@@ -4779,7 +4785,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
auto attempts = progress["initialSyncAttempts"].Obj();
ASSERT_EQUALS(attempts.nFields(), 1) << progress;
auto attempt0 = attempts["0"].Obj();
- ASSERT_EQUALS(attempt0.nFields(), 3) << attempt0;
+ ASSERT_EQUALS(attempt0.nFields(), 6) << attempt0;
const std::string expectedlistDatabaseFailure =
"FailedToParse: error cloning databases :: caused by :: Command 'listDatabases' failed.";
ASSERT_EQUALS(std::string(attempt0.getStringField("status"))
@@ -4789,6 +4795,9 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_EQUALS(attempt0["durationMillis"].type(), NumberInt) << attempt0;
ASSERT_EQUALS(attempt0.getStringField("syncSource"), std::string("localhost:27017"))
<< attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("rollBackId"), 1) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("operationsRetried"), 0) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("totalTimeUnreachableMillis"), 0) << attempt0;
// Play last successful response.
{
@@ -4816,7 +4825,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
progress = initialSyncer->getInitialSyncProgress();
log() << "Progress at end: " << progress;
- ASSERT_EQUALS(progress.nFields(), 10) << progress;
+ ASSERT_EQUALS(progress.nFields(), 11) << progress;
ASSERT_EQUALS(progress.getIntField("failedInitialSyncAttempts"), 1) << progress;
ASSERT_EQUALS(progress.getIntField("maxFailedInitialSyncAttempts"), 2) << progress;
ASSERT_EQUALS(progress["initialSyncStart"].type(), Date) << progress;
@@ -4824,6 +4833,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_EQUALS(progress["initialSyncOplogStart"].timestamp(), Timestamp(1, 1)) << progress;
ASSERT_EQUALS(progress["initialSyncOplogEnd"].timestamp(), Timestamp(7, 1)) << progress;
ASSERT_EQUALS(progress["initialSyncElapsedMillis"].type(), NumberInt) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 0) << progress;
// Expected applied ops to be a superset of this range: Timestamp(2,1) ... Timestamp(7,1).
ASSERT_GREATER_THAN_OR_EQUALS(progress.getIntField("appliedOps"), 6) << progress;
@@ -4831,7 +4841,7 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_EQUALS(attempts.nFields(), 2) << attempts;
attempt0 = attempts["0"].Obj();
- ASSERT_EQUALS(attempt0.nFields(), 3) << attempt0;
+ ASSERT_EQUALS(attempt0.nFields(), 6) << attempt0;
ASSERT_EQUALS(std::string(attempt0.getStringField("status"))
.substr(0, expectedlistDatabaseFailure.length()),
expectedlistDatabaseFailure)
@@ -4839,13 +4849,148 @@ TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgress) {
ASSERT_EQUALS(attempt0["durationMillis"].type(), NumberInt) << attempt0;
ASSERT_EQUALS(attempt0.getStringField("syncSource"), std::string("localhost:27017"))
<< attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("rollBackId"), 1) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("operationsRetried"), 0) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("totalTimeUnreachableMillis"), 0) << attempt0;
BSONObj attempt1 = attempts["1"].Obj();
- ASSERT_EQUALS(attempt1.nFields(), 3) << attempt1;
+ ASSERT_EQUALS(attempt1.nFields(), 6) << attempt1;
ASSERT_EQUALS(attempt1.getStringField("status"), std::string("OK")) << attempt1;
ASSERT_EQUALS(attempt1["durationMillis"].type(), NumberInt) << attempt1;
ASSERT_EQUALS(attempt1.getStringField("syncSource"), std::string("localhost:27017"))
<< attempt1;
+ ASSERT_EQUALS(attempt1.getIntField("rollBackId"), 1) << attempt1;
+ ASSERT_EQUALS(attempt1.getIntField("operationsRetried"), 0) << attempt1;
+ ASSERT_EQUALS(attempt1.getIntField("totalTimeUnreachableMillis"), 0) << attempt1;
+}
+
+TEST_F(InitialSyncerTest, GetInitialSyncProgressReturnsCorrectProgressForNetworkError) {
+ // Skip reconstructing prepared transactions at the end of initial sync because
+ // InitialSyncerTest does not construct ServiceEntryPoint and this causes a segmentation fault
+ // when reconstructPreparedTransactions uses DBDirectClient to call into ServiceEntryPoint.
+ FailPointEnableBlock skipReconstructPreparedTransactions("skipReconstructPreparedTransactions");
+
+ // Skip clearing initial sync progress so that we can check initialSyncStatus fields after
+ // initial sync is complete.
+ FailPointEnableBlock skipClearInitialSyncState("skipClearInitialSyncState");
+
+ auto initialSyncer = &getInitialSyncer();
+ auto opCtx = makeOpCtx();
+
+ _syncSourceSelector->setChooseNewSyncSourceResult_forTest(HostAndPort("localhost", 12345));
+ ASSERT_OK(initialSyncer->startup(opCtx.get(), maxAttempts));
+
+ auto net = getNet();
+ {
+ executor::NetworkInterfaceMock::InNetworkGuard guard(net);
+
+ // Base rollback ID.
+ net->scheduleSuccessfulResponse(makeRollbackCheckerResponse(1));
+
+ // Oplog entry associated with the defaultBeginFetchingTimestamp.
+ processSuccessfulLastOplogEntryFetcherResponse({makeOplogEntryObj(1)});
+
+ // Send an empty optime as the response to the beginFetchingOptime find request, which will
+ // cause the beginFetchingTimestamp to be set to the defaultBeginFetchingTimestamp.
+ auto request = net->scheduleSuccessfulResponse(
+ makeCursorResponse(0LL, NamespaceString::kSessionTransactionsTableNamespace, {}, true));
+ assertRemoteCommandNameEquals("find", request);
+ net->runReadyNetworkOperations();
+
+ // Oplog entry associated with the beginApplyingTimestamp.
+ processSuccessfulLastOplogEntryFetcherResponse({makeOplogEntryObj(1)});
+
+ {
+ // Ensure second lastOplogFetch doesn't happen until we're ready for it.
+ FailPointEnableBlock clonerFailpoint("hangAfterClonerStage",
+ kListDatabasesFailPointData);
+ // Feature Compatibility Version.
+ processSuccessfulFCVFetcherResponseLastStable();
+
+ // We do not have to respond to the OplogFetcher's oplog tailing query. Blackhole and
+ // move on to the AllDatabaseCloner's request.
+ auto noi = net->getNextReadyRequest();
+ request = assertRemoteCommandNameEquals("find", noi->getRequest());
+ ASSERT_TRUE(request.cmdObj.getBoolField("tailable"));
+ net->blackHole(noi);
+ }
+
+ // Oplog entry associated with the stopTimestamp.
+ request = assertRemoteCommandNameEquals(
+ "find",
+ net->scheduleErrorResponse(Status(ErrorCodes::HostUnreachable,
+ "Oplog entry fetcher associated with the "
+ "stopTimestamp failed with network error")));
+ ASSERT_TRUE(request.cmdObj.hasField("sort"));
+ ASSERT_EQUALS(1, request.cmdObj.getIntField("limit"));
+ net->runReadyNetworkOperations();
+ auto outageStart = net->now();
+
+ auto progress = initialSyncer->getInitialSyncProgress();
+ log() << "Progress after first network error: " << progress;
+ ASSERT_EQUALS(progress.getField("syncSourceUnreachableSince").date(), outageStart)
+ << progress;
+ ASSERT_EQUALS(progress.getIntField("currentOutageDurationMillis"), 0) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 0) << progress;
+
+ // Advance the clock but not enough to terminate.
+ net->advanceTime(net->now() + Seconds(1));
+
+ // Retry fetch of oplog entry associated with the stopTimestamp.
+ request = assertRemoteCommandNameEquals(
+ "find",
+ net->scheduleErrorResponse(Status(ErrorCodes::HostUnreachable,
+ "Oplog entry fetcher associated with the "
+ "stopTimestamp failed with network error")));
+ ASSERT_TRUE(request.cmdObj.hasField("sort"));
+ ASSERT_EQUALS(1, request.cmdObj.getIntField("limit"));
+ net->runReadyNetworkOperations();
+
+ progress = initialSyncer->getInitialSyncProgress();
+ log() << "Progress after failed retry: " << progress;
+ ASSERT_EQUALS(progress.getField("syncSourceUnreachableSince").date(), outageStart)
+ << progress;
+ ASSERT_EQUALS(progress.getIntField("currentOutageDurationMillis"), 1000) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 1000) << progress;
+
+ // Advance the clock but not enough to terminate.
+ net->advanceTime(net->now() + Seconds(1));
+
+ // Second retry succeeds.
+ processSuccessfulLastOplogEntryFetcherResponse({makeOplogEntryObj(1)});
+ net->runReadyNetworkOperations();
+
+ progress = initialSyncer->getInitialSyncProgress();
+ log() << "Progress after successful retry: " << progress;
+ ASSERT(progress.getField("syncSourceUnreachableSince").eoo());
+ ASSERT(progress.getField("currentOutageDurationMillis").eoo()) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 2000) << progress;
+
+ // Last rollback checker replSetGetRBID command.
+ assertRemoteCommandNameEquals(
+ "replSetGetRBID", net->scheduleSuccessfulResponse(makeRollbackCheckerResponse(1)));
+ net->runReadyNetworkOperations();
+
+ // _lastOplogEntryFetcherCallbackAfterCloningData() will shut down the OplogFetcher after
+ // setting the completion status.
+ // We call runReadyNetworkOperations() again to deliver the cancellation status to
+ // _oplogFetcherCallback().
+ net->runReadyNetworkOperations();
+ }
+
+ initialSyncer->join();
+ ASSERT_OK(_lastApplied.getStatus());
+
+ auto progress = initialSyncer->getInitialSyncProgress();
+ log() << "Progress at end: " << progress;
+ ASSERT(progress.getField("syncSourceUnreachableSince").eoo());
+ ASSERT(progress.getField("currentOutageDurationMillis").eoo()) << progress;
+ ASSERT_EQUALS(progress.getIntField("totalTimeUnreachableMillis"), 2000) << progress;
+ BSONObj attempts = progress["initialSyncAttempts"].Obj();
+ ASSERT_EQUALS(attempts.nFields(), 1) << attempts;
+ BSONObj attempt0 = attempts["0"].Obj();
+ ASSERT_EQUALS(attempt0.getIntField("totalTimeUnreachableMillis"), 2000) << attempt0;
+ ASSERT_EQUALS(attempt0.getIntField("operationsRetried"), 2) << attempt0;
}
TEST_F(InitialSyncerTest, GetInitialSyncProgressOmitsClonerStatsIfClonerStatsExceedBsonLimit) {