diff options
author | Lingzhi Deng <lingzhi.deng@mongodb.com> | 2020-02-19 18:12:23 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-02-20 16:22:02 +0000 |
commit | f26fc48b099deb568cac3c2702cbc636991ead92 (patch) | |
tree | 7c42ed15d0eb00acbe449fb60bdc33a4c6237806 | |
parent | 31426dd86fd67c7f4a0a8d586158a6aad5e59f27 (diff) | |
download | mongo-f26fc48b099deb568cac3c2702cbc636991ead92.tar.gz |
SERVER-44710: Add metrics.repl.network.getmores.numEmptyBatches
-rw-r--r-- | jstests/replsets/server_status_metrics.js | 17 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog_fetcher.cpp | 33 |
2 files changed, 46 insertions, 4 deletions
diff --git a/jstests/replsets/server_status_metrics.js b/jstests/replsets/server_status_metrics.js index acf2df103a4..a88bb03c80d 100644 --- a/jstests/replsets/server_status_metrics.js +++ b/jstests/replsets/server_status_metrics.js @@ -255,6 +255,12 @@ jsTestLog(`Secondary ${secondary.host} metrics before restarting replication: ${ // Enable periodic noops to aid sync source selection. assert.commandWorked(primary.adminCommand({setParameter: 1, writePeriodicNoops: true})); +// Enable the setSmallOplogGetMoreMaxTimeMS failpoint on secondary so that it will start using +// a small awaitData timeout for oplog fetching after re-choosing the sync source. This is needed to +// make sync source return empty batches more frequently in order to test the metric +// numEmptyBatches. +configureFailPoint(secondary, 'setSmallOplogGetMoreMaxTimeMS'); + // Repeatedly restart replication and wait for the sync source to be rechosen. If the sync source // gets set to empty between stopping and restarting replication, then the secondary won't // increment numTimesChoseSame, so we do this in a loop. @@ -279,6 +285,17 @@ assert.soon( assert.gt(ssNew.numSelections, ssOld.numSelections, "num selections not incremented"); assert.gt(ssNew.numTimesChoseSame, ssOld.numTimesChoseSame, "same sync source not chosen"); +// Get the base number of empty batches after the secondary is up to date. Assert that the secondary +// eventually gets an empty batch due to awaitData timeout. +rt.awaitLastOpCommitted(); +const targetNumEmptyBatches = + secondary.getDB("test").serverStatus().metrics.repl.network.getmores.numEmptyBatches + 1; +assert.soon( + () => secondary.getDB("test").serverStatus().metrics.repl.network.getmores.numEmptyBatches >= + targetNumEmptyBatches, + `Timed out waiting for numEmptyBatches reach ${targetNumEmptyBatches}, current ${ + secondary.getDB("test").serverStatus().metrics.repl.network.getmores.numEmptyBatches}`); + // Stop the primary so the secondary cannot choose a sync source. ssOld = ssNew; rt.stop(primary); diff --git a/src/mongo/db/repl/oplog_fetcher.cpp b/src/mongo/db/repl/oplog_fetcher.cpp index 94c212ce950..e02922be165 100644 --- a/src/mongo/db/repl/oplog_fetcher.cpp +++ b/src/mongo/db/repl/oplog_fetcher.cpp @@ -61,11 +61,36 @@ MONGO_FAIL_POINT_DEFINE(hangBeforeOplogFetcherRetries); MONGO_FAIL_POINT_DEFINE(hangBeforeProcessingSuccessfulBatch); namespace { +class OplogBatchStats { +public: + void recordMillis(int millis, bool isEmptyBatch); + BSONObj getReport() const; + operator BSONObj() const { + return getReport(); + } + +private: + TimerStats _getMores; + Counter64 _numEmptyBatches; +}; + +void OplogBatchStats::recordMillis(int millis, bool isEmptyBatch) { + _getMores.recordMillis(millis); + if (isEmptyBatch) { + _numEmptyBatches.increment(); + } +} + +BSONObj OplogBatchStats::getReport() const { + BSONObjBuilder b(_getMores.getReport()); + b.append("numEmptyBatches", _numEmptyBatches.get()); + return b.obj(); +} // The number and time spent reading batches off the network -TimerStats getmoreReplStats; -ServerStatusMetricField<TimerStats> displayBatchesRecieved("repl.network.getmores", - &getmoreReplStats); +OplogBatchStats oplogBatchStats; +ServerStatusMetricField<OplogBatchStats> displayBatchesRecieved("repl.network.getmores", + &oplogBatchStats); // The oplog entries read via the oplog reader Counter64 opsReadStats; ServerStatusMetricField<Counter64> displayOpsRead("repl.network.ops", &opsReadStats); @@ -876,7 +901,7 @@ Status OplogFetcher::_onSuccessfulBatch(const Documents& documents) { opsReadStats.increment(info.networkDocumentCount); networkByteStats.increment(info.networkDocumentBytes); - getmoreReplStats.recordMillis(_lastBatchElapsedMS); + oplogBatchStats.recordMillis(_lastBatchElapsedMS, documents.empty()); auto status = _enqueueDocumentsFn(firstDocToApply, documents.cend(), info); if (!status.isOK()) { |