diff options
author | Judah Schvimer <judah.schvimer@10gen.com> | 2019-10-11 17:39:49 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-10-11 17:39:49 +0000 |
commit | 9b3801e457c4952e36f2a13d45387d647c301e03 (patch) | |
tree | a617e2650e6a812355c859ebe5998e0294b38b42 | |
parent | 546e411b72cf6f75d24b304ce9219d1f3d3a4e4f (diff) | |
download | mongo-9b3801e457c4952e36f2a13d45387d647c301e03.tar.gz |
SERVER-34722 Add new server status metrics about oplog application
-rw-r--r-- | jstests/replsets/server_status_metrics.js | 75 | ||||
-rw-r--r-- | src/mongo/db/repl/bgsync.cpp | 32 | ||||
-rw-r--r-- | src/mongo/db/repl/reporter.cpp | 9 |
3 files changed, 110 insertions, 6 deletions
diff --git a/jstests/replsets/server_status_metrics.js b/jstests/replsets/server_status_metrics.js index e7784ba0c76..50e58380299 100644 --- a/jstests/replsets/server_status_metrics.js +++ b/jstests/replsets/server_status_metrics.js @@ -1,9 +1,16 @@ + +(function() { +"use strict"; + +load("jstests/libs/check_log.js"); +load("jstests/libs/write_concern_util.js"); + /** * Test replication metrics */ function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceived) { var ss = secondary.getDB("test").serverStatus(); - printjson(ss.metrics); + jsTestLog(`Secondary ${secondary.host} metrics: ${tojson(ss.metrics)}`); assert(ss.metrics.repl.network.readersCreated > 0, "no (oplog) readers created"); assert(ss.metrics.repl.network.getmores.num > 0, "no getmores"); @@ -18,6 +25,11 @@ function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceive ss.metrics.repl.network.ops, opCount + baseOpsApplied, "wrong number of ops retrieved"); assert(ss.metrics.repl.network.bytes > 0, "zero or missing network bytes"); + assert.gt( + ss.metrics.repl.network.replSetUpdatePosition.num, 0, "no update position commands sent"); + assert.gt(ss.metrics.repl.syncSource.numSelections, 0, "num selections not incremented"); + assert.gt(ss.metrics.repl.syncSource.numTimesChoseDifferent, 0, "no new sync source chosen"); + assert(ss.metrics.repl.buffer.count >= 0, "buffer count missing"); assert(ss.metrics.repl.buffer.sizeBytes >= 0, "size (bytes)] missing"); assert(ss.metrics.repl.buffer.maxSizeBytes >= 0, "maxSize (bytes) missing"); @@ -30,9 +42,16 @@ function testSecondaryMetrics(secondary, opCount, baseOpsApplied, baseOpsReceive assert.eq(ss.metrics.repl.apply.ops, opCount + baseOpsApplied, "wrong number of applied ops"); } -var rt = new ReplSetTest({name: "server_status_metrics", nodes: 2, oplogSize: 100}); +var rt = new ReplSetTest({ + name: "server_status_metrics", + nodes: 2, + oplogSize: 100, + // Write periodic noops to aid sync source selection. ReplSetTest.initiate() requires at least a + // 2 second noop writer interval to converge on a lastApplied optime. + nodeOptions: {setParameter: {writePeriodicNoops: true, periodicNoopIntervalSecs: 2}} +}); rt.startSet(); -rt.initiate(); +rt.initiateWithHighElectionTimeout(); rt.awaitSecondaryNodes(); @@ -52,7 +71,7 @@ var secondaryBaseOplogOpsReceived = ss.metrics.repl.apply.batchSize; // add test docs var bulk = testDB.a.initializeUnorderedBulkOp(); -for (x = 0; x < 1000; x++) { +for (let x = 0; x < 1000; x++) { bulk.insert({}); } assert.commandWorked(bulk.execute({w: 2})); @@ -68,7 +87,8 @@ testSecondaryMetrics(secondary, 2000, secondaryBaseOplogOpsApplied, secondaryBas var startMillis = testDB.serverStatus().metrics.getLastError.wtime.totalMillis; var startNum = testDB.serverStatus().metrics.getLastError.wtime.num; -printjson(primary.getDB("test").serverStatus().metrics); +jsTestLog( + `Primary ${primary.host} metrics #1: ${tojson(primary.getDB("test").serverStatus().metrics)}`); assert.commandWorked(testDB.a.insert({x: 1}, {writeConcern: {w: 1, wtimeout: 5000}})); assert.eq(testDB.serverStatus().metrics.getLastError.wtime.totalMillis, startMillis); @@ -86,6 +106,49 @@ assert.eq(testDB.serverStatus().metrics.getLastError.wtime.num, startNum + 1); assert.writeError(testDB.a.insert({x: 1}, {writeConcern: {w: 3, wtimeout: 50}})); assert.eq(testDB.serverStatus().metrics.getLastError.wtime.num, startNum + 2); -printjson(primary.getDB("test").serverStatus().metrics); +jsTestLog( + `Primary ${primary.host} metrics #2: ${tojson(primary.getDB("test").serverStatus().metrics)}`); + +let ssOld = secondary.getDB("test").serverStatus().metrics.repl.syncSource; +jsTestLog(`Secondary ${secondary.host} metrics before restarting replication: ${tojson(ssOld)}`); + +// Repeatedly restart replication and wait for the sync source to be rechosen. If the sync source +// gets set to empty between stopping and restarting replication, then the secondary won't +// increment numTimesChoseSame, so we do this in a loop. +let ssNew; +assert.soon( + function() { + stopServerReplication(secondary); + restartServerReplication(secondary); + + // Do a dummy write to choose a new sync source and replicate the write to block on that. + assert.commandWorked( + primary.getDB("test").bar.insert({"dummy_write": 3}, {writeConcern: {w: 2}})); + ssNew = secondary.getDB("test").serverStatus().metrics.repl.syncSource; + jsTestLog( + `Secondary ${secondary.host} metrics after restarting replication: ${tojson(ssNew)}`); + return ssNew.numTimesChoseSame > ssOld.numTimesChoseSame; + }, + "timed out waiting to re-choose same sync source", + null, + 3 * 1000 /* 3sec interval to wait for noop */); + +assert.gt(ssNew.numSelections, ssOld.numSelections, "num selections not incremented"); +assert.gt(ssNew.numTimesChoseSame, ssOld.numTimesChoseSame, "same sync source not chosen"); + +// Stop the primary so the secondary cannot choose a sync source. +ssOld = ssNew; +rt.stop(primary); +stopServerReplication(secondary); +restartServerReplication(secondary); +assert.soon(function() { + ssNew = secondary.getDB("test").serverStatus().metrics.repl.syncSource; + jsTestLog(`Secondary ${secondary.host} metrics after stopping primary: ${tojson(ssNew)}`); + return ssNew.numTimesCouldNotFind > ssOld.numTimesCouldNotFind; +}); + +assert.gt(ssNew.numSelections, ssOld.numSelections, "num selections not incremented"); +assert.gt(ssNew.numTimesCouldNotFind, ssOld.numTimesCouldNotFind, "found new sync source"); rt.stopSet(); +})(); diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 8b6a0d4221c..497a0fefca9 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -74,6 +74,31 @@ const int kSleepToAllowBatchingMillis = 2; const int kSmallBatchLimitBytes = 40000; const Milliseconds kRollbackOplogSocketTimeout(10 * 60 * 1000); +// The number of times a node attempted to choose a node to sync from among the available sync +// source options. This occurs if we re-evaluate our sync source, receive an error from the source, +// or step down. +Counter64 numSyncSourceSelections; +ServerStatusMetricField<Counter64> displayNumSyncSourceSelections("repl.syncSource.numSelections", + &numSyncSourceSelections); + +// The number of times a node kept it's original sync source after re-evaluating if its current sync +// source was optimal. +Counter64 numTimesChoseSameSyncSource; +ServerStatusMetricField<Counter64> displayNumTimesChoseSameSyncSource( + "repl.syncSource.numTimesChoseSame", &numTimesChoseSameSyncSource); + +// The number of times a node chose a new sync source after re-evaluating if its current sync source +// was optimal. +Counter64 numTimesChoseDifferentSyncSource; +ServerStatusMetricField<Counter64> displayNumTimesChoseDifferentSyncSource( + "repl.syncSource.numTimesChoseDifferent", &numTimesChoseDifferentSyncSource); + +// The number of times a node could not find a sync source when choosing a node to sync from among +// the available options. +Counter64 numTimesCouldNotFindSyncSource; +ServerStatusMetricField<Counter64> displayNumTimesCouldNotFindSyncSource( + "repl.syncSource.numTimesCouldNotFind", &numTimesCouldNotFindSyncSource); + /** * Extends DataReplicatorExternalStateImpl to be member state aware. */ @@ -298,6 +323,8 @@ void BackgroundSync::_produce() { _syncSourceResolver.reset(); } + numSyncSourceSelections.increment(1); + if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources are too far ahead of us. if (_replCoord->getMemberState().primary()) { @@ -351,11 +378,13 @@ void BackgroundSync::_produce() { log() << "Chose same sync source candidate as last time, " << source << ". Sleeping for 1 second to avoid immediately choosing a new sync source for " "the same reason as last time."; + numTimesChoseSameSyncSource.increment(1); sleepsecs(1); } else { log() << "Changed sync source from " << (oldSource.empty() ? std::string("empty") : oldSource.toString()) << " to " << source; + numTimesChoseDifferentSyncSource.increment(1); } } else { if (!syncSourceResp.isOK()) { @@ -363,6 +392,9 @@ void BackgroundSync::_produce() { << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. + LOG(1) << "Could not find a sync source. Sleeping for 1 second before trying again."; + numTimesCouldNotFindSyncSource.increment(1); + sleepsecs(1); return; } diff --git a/src/mongo/db/repl/reporter.cpp b/src/mongo/db/repl/reporter.cpp index d659ee83965..f66eb930893 100644 --- a/src/mongo/db/repl/reporter.cpp +++ b/src/mongo/db/repl/reporter.cpp @@ -33,7 +33,9 @@ #include "mongo/db/repl/reporter.h" +#include "mongo/base/counter.h" #include "mongo/bson/util/bson_extract.h" +#include "mongo/db/commands/server_status_metric.h" #include "mongo/db/repl/update_position_args.h" #include "mongo/rpc/get_status_from_command_result.h" #include "mongo/util/assert_util.h" @@ -47,6 +49,11 @@ namespace { const char kConfigVersionFieldName[] = "configVersion"; +// The number of replSetUpdatePosition commands a node sent to its sync source. +Counter64 numUpdatePosition; +ServerStatusMetricField<Counter64> displayNumUpdatePosition( + "repl.network.replSetUpdatePosition.num", &numUpdatePosition); + /** * Returns configuration version in update command object. * Returns -1 on failure. @@ -233,6 +240,8 @@ void Reporter::_sendCommand_inlock(BSONObj commandRequest, Milliseconds netTimeo return; } + numUpdatePosition.increment(1); + _remoteCommandCallbackHandle = scheduleResult.getValue(); } |