diff options
author | Maria van Keulen <maria.vankeulen@mongodb.com> | 2019-10-10 22:05:51 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-10-10 22:05:51 +0000 |
commit | dd819eb95636f47f13638259208ae8a69e48ded7 (patch) | |
tree | ce42f99d086105373cf2df7c74740a910279fad2 | |
parent | de724d3d8a82667ac9da97c39abf0b9d9728cba4 (diff) | |
download | mongo-dd819eb95636f47f13638259208ae8a69e48ded7.tar.gz |
SERVER-43322 Add tunable oplog stone sizes and track truncation speeds
(cherry picked from commit 294a8f68615710b47936d5ee42439d01538ac746)
SERVER-43322 Disallow oplog_sampling.js on inMemory storage engine
(cherry picked from commit b35dd89515473c97a87b3c06897e8a7ab51c93cc)
This backport was not a straightforward cherry-pick due to SERVER-40168
existing in master and not v4.2.
10 files changed, 215 insertions, 9 deletions
diff --git a/jstests/replsets/oplog_rollover.js b/jstests/replsets/oplog_rollover.js index b9a08bbb80e..5199a1802fc 100644 --- a/jstests/replsets/oplog_rollover.js +++ b/jstests/replsets/oplog_rollover.js @@ -104,6 +104,14 @@ function doTest(storageEngine) { assert.soon(() => { return numInsertOplogEntry(secondaryOplog) === 2; }, "Timeout waiting for oplog to roll over on secondary"); + + if (jsTest.options().storageEngine == "wiredTiger") { + const res = primary.getDB("test").runCommand({serverStatus: 1}); + assert.commandWorked(res); + assert.eq(res.oplogTruncation.truncateCount, 1, tojson(res.oplogTruncation)); + assert.gt( + res.oplogTruncation.totalTimeTruncatingMicros, 0, tojson(res.oplogTruncation)); + } } else { // Only test that oplog truncation will eventually happen. let numInserted = 2; diff --git a/jstests/replsets/oplog_sampling.js b/jstests/replsets/oplog_sampling.js new file mode 100644 index 00000000000..3407b1826c9 --- /dev/null +++ b/jstests/replsets/oplog_sampling.js @@ -0,0 +1,41 @@ +/** + * Ensure serverStatus reports the total time spent sampling the oplog for all storage engines that + * support OplogStones. + * @tags: [ requires_wiredtiger, requires_persistence ] + */ +(function() { +"use strict"; + +// Force oplog sampling to occur on start up for small numbers of oplog inserts. +const replSet = new ReplSetTest( + {nodes: 1, nodeOptions: {setParameter: {"maxOplogTruncationPointsDuringStartup": 10}}}); +replSet.startSet(); +replSet.initiate(); + +let coll = replSet.getPrimary().getDB("test").getCollection("testcoll"); + +let res = replSet.getPrimary().getDB("test").serverStatus(); +assert.commandWorked(res); + +// Small (or empty) oplogs should be processed by scanning. +assert.gt(res.oplogTruncation.totalTimeProcessingMicros, 0); +assert.eq(res.oplogTruncation.processingMethod, "scanning"); + +// Insert enough documents to force oplog sampling to occur on the following start up. 
+const maxOplogDocsForScanning = 2000; +for (let i = 0; i < maxOplogDocsForScanning + 1; i++) { + assert.commandWorked(coll.insert({m: 1 + i})); +} + +// Restart replica set to load entries from the oplog for sampling. +replSet.stopSet(null /* signal */, true /* forRestart */); +replSet.startSet({restart: true}); + +res = replSet.getPrimary().getDB("test").serverStatus(); +assert.commandWorked(res); + +assert.gt(res.oplogTruncation.totalTimeProcessingMicros, 0); +assert.eq(res.oplogTruncation.processingMethod, "sampling"); + +replSet.stopSet(); +})(); diff --git a/src/mongo/db/storage/wiredtiger/SConscript b/src/mongo/db/storage/wiredtiger/SConscript index 4434fea01bd..33dd1b9c934 100644 --- a/src/mongo/db/storage/wiredtiger/SConscript +++ b/src/mongo/db/storage/wiredtiger/SConscript @@ -83,6 +83,9 @@ if wiredtiger: 'storage_wiredtiger_customization_hooks', ], LIBDEPS_PRIVATE= [ + 'oplog_stone_parameters', + '$BUILD_DIR/mongo/db/db_raii', + '$BUILD_DIR/mongo/db/commands/server_status', '$BUILD_DIR/mongo/db/snapshot_window_options', '$BUILD_DIR/mongo/db/storage/storage_repair_observer', '$BUILD_DIR/mongo/util/options_parser/options_parser', @@ -183,6 +186,16 @@ if wiredtiger: ], ) + wtEnv.Library( + target='oplog_stone_parameters', + source=[ + env.Idlc('oplog_stone_parameters.idl')[0], + ], + LIBDEPS=[ + '$BUILD_DIR/mongo/idl/server_parameter', + ], + ) + # All of these tests fail to compile under undefined behavior # sanitizer due to unexpressed circular dependency edges. In particular # they all need a definition from the 'catalog'. diff --git a/src/mongo/db/storage/wiredtiger/oplog_stone_parameters.idl b/src/mongo/db/storage/wiredtiger/oplog_stone_parameters.idl new file mode 100644 index 00000000000..4737f234bd4 --- /dev/null +++ b/src/mongo/db/storage/wiredtiger/oplog_stone_parameters.idl @@ -0,0 +1,59 @@ +# Copyright (C) 2019-present MongoDB, Inc. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the Server Side Public License, version 1, +# as published by MongoDB, Inc. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Server Side Public License for more details. +# +# You should have received a copy of the Server Side Public License +# along with this program. If not, see +# <http://www.mongodb.com/licensing/server-side-public-license>. +# +# As a special exception, the copyright holders give permission to link the +# code of portions of this program with the OpenSSL library under certain +# conditions as described in each individual source file and distribute +# linked combinations including the program with the OpenSSL library. You +# must comply with the Server Side Public License in all respects for +# all of the code used other than as permitted herein. If you modify file(s) +# with this exception, you may extend this exception to your version of the +# file(s), but you are not obligated to do so. If you do not wish to do so, +# delete this exception statement from your version. If you delete this +# exception statement from all source files in the program, then also delete +# it in the license file. 
+# +global: + cpp_namespace: "mongo" + +server_parameters: + maxOplogTruncationPointsAfterStartup: + description: 'Maximum allowable number of oplog truncation points after startup has finished' + set_at: [ startup ] + cpp_vartype: 'long long' + cpp_varname: gMaxOplogStonesAfterStartup + default: 100 + validator: { gt: 0 } + maxOplogTruncationPointsDuringStartup: + description: 'Maximum allowable number of oplog truncation points during startup' + set_at: [ startup ] + cpp_vartype: 'long long' + cpp_varname: gMaxOplogStonesDuringStartup + default: 100 + validator: { gt: 0 } + minOplogTruncationPoints: + description: 'Minimum allowable number of oplog truncation points' + set_at: [ startup ] + cpp_vartype: 'long long' + cpp_varname: gMinOplogStones + default: 10 + validator: { gt: 0 } + oplogTruncationPointSizeMB: + description: 'Oplog truncation point size in MB used to determine the number of oplog truncation points for an oplog of a given size. The size will be rounded up to the maximum size of an internal BSON object.' + set_at: [ startup ] + cpp_vartype: 'int' + cpp_varname: gOplogStoneSizeMB + default: 0 + validator: { gte: 0 } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_init.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_init.cpp index b01e796f1de..3a06df1682b 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_init.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_init.cpp @@ -118,6 +118,7 @@ public: kv->setSortedDataInterfaceExtraOptions(wiredTigerGlobalOptions.indexConfig); // Intentionally leaked. 
new WiredTigerServerStatusSection(kv); + new OplogStonesServerStatusSection(); auto* param = new WiredTigerEngineRuntimeConfigParameter("wiredTigerEngineRuntimeConfig", ServerParameterType::kRuntimeOnly); param->_data.second = kv; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 16358b35029..d04a8edf30a 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -50,6 +50,7 @@ #include "mongo/db/server_recovery.h" #include "mongo/db/service_context.h" #include "mongo/db/storage/oplog_hack.h" +#include "mongo/db/storage/wiredtiger/oplog_stone_parameters_gen.h" #include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h" #include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h" #include "mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h" @@ -164,10 +165,16 @@ WiredTigerRecordStore::OplogStones::OplogStones(OperationContext* opCtx, WiredTi invariant(rs->cappedMaxSize() > 0); unsigned long long maxSize = rs->cappedMaxSize(); - const unsigned long long kMinStonesToKeep = 10ULL; - const unsigned long long kMaxStonesToKeep = 100ULL; + // The minimum oplog stone size should be BSONObjMaxInternalSize. + const unsigned int oplogStoneSize = + std::max(gOplogStoneSizeMB * 1024 * 1024, BSONObjMaxInternalSize); - unsigned long long numStones = maxSize / BSONObjMaxInternalSize; + // IDL does not support unsigned long long types. 
+ const unsigned long long kMinStonesToKeep = static_cast<unsigned long long>(gMinOplogStones); + const unsigned long long kMaxStonesToKeep = + static_cast<unsigned long long>(gMaxOplogStonesDuringStartup); + + unsigned long long numStones = maxSize / oplogStoneSize; size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones)); _minBytesPerStone = maxSize / numStonesToKeep; invariant(_minBytesPerStone > 0); @@ -313,6 +320,12 @@ void WiredTigerRecordStore::OplogStones::setMinBytesPerStone(int64_t size) { void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCtx, size_t numStonesToKeep) { + const std::uint64_t startWaitTime = curTimeMicros64(); + ON_BLOCK_EXIT([&] { + auto waitTime = curTimeMicros64() - startWaitTime; + log() << "WiredTiger record store oplog processing took " << waitTime / 1000 << "ms"; + _totalTimeProcessing.fetchAndAdd(waitTime); + }); long long numRecords = _rs->numRecords(opCtx); long long dataSize = _rs->dataSize(opCtx); @@ -342,6 +355,7 @@ void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCt } void WiredTigerRecordStore::OplogStones::_calculateStonesByScanning(OperationContext* opCtx) { + _processBySampling.store(false); // process by scanning log() << "Scanning the oplog to determine where to place markers for truncation"; long long numRecords = 0; @@ -369,6 +383,8 @@ void WiredTigerRecordStore::OplogStones::_calculateStonesByScanning(OperationCon void WiredTigerRecordStore::OplogStones::_calculateStonesBySampling(OperationContext* opCtx, int64_t estRecordsPerStone, int64_t estBytesPerStone) { + log() << "Sampling the oplog to determine where to place markers for truncation"; + _processBySampling.store(true); // process by sampling Timestamp earliestOpTime; Timestamp latestOpTime; @@ -457,10 +473,16 @@ void WiredTigerRecordStore::OplogStones::_pokeReclaimThreadIfNeeded() { void WiredTigerRecordStore::OplogStones::adjust(int64_t maxSize) { 
stdx::lock_guard<stdx::mutex> lk(_mutex); - const unsigned long long kMinStonesToKeep = 10ULL; - const unsigned long long kMaxStonesToKeep = 100ULL; - unsigned long long numStones = maxSize / BSONObjMaxInternalSize; + const unsigned int oplogStoneSize = + std::max(gOplogStoneSizeMB * 1024 * 1024, BSONObjMaxInternalSize); + + // IDL does not support unsigned long long types. + const unsigned long long kMinStonesToKeep = static_cast<unsigned long long>(gMinOplogStones); + const unsigned long long kMaxStonesToKeep = + static_cast<unsigned long long>(gMaxOplogStonesAfterStartup); + + unsigned long long numStones = maxSize / oplogStoneSize; size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones)); _minBytesPerStone = maxSize / numStonesToKeep; invariant(_minBytesPerStone > 0); @@ -763,6 +785,14 @@ void WiredTigerRecordStore::postConstructorInit(OperationContext* opCtx) { } } +void WiredTigerRecordStore::getOplogTruncateStats(BSONObjBuilder& builder) const { + if (_oplogStones) { + _oplogStones->getOplogStonesStats(builder); + } + builder.append("totalTimeTruncatingMicros", _totalTimeTruncating.load()); + builder.append("truncateCount", _truncateCount.load()); +} + const char* WiredTigerRecordStore::name() const { return _engineName.c_str(); } @@ -1220,7 +1250,11 @@ void WiredTigerRecordStore::reclaimOplog(OperationContext* opCtx, Timestamp mayT LOG(1) << "Finished truncating the oplog, it now contains approximately " << _sizeInfo->numRecords.load() << " records totaling to " << _sizeInfo->dataSize.load() << " bytes"; - log() << "WiredTiger record store oplog truncation finished in: " << timer.millis() << "ms"; + auto elapsedMicros = timer.micros(); + auto elapsedMillis = elapsedMicros / 1000; + _totalTimeTruncating.fetchAndAdd(elapsedMicros); + _truncateCount.fetchAndAdd(1); + log() << "WiredTiger record store oplog truncation finished in: " << elapsedMillis << "ms"; } Status WiredTigerRecordStore::insertRecords(OperationContext* opCtx, 
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h index a3d5870a489..9737bd700a3 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h @@ -117,6 +117,8 @@ public: WiredTigerRecordStore(WiredTigerKVEngine* kvEngine, OperationContext* opCtx, Params params); + void getOplogTruncateStats(BSONObjBuilder& builder) const; + virtual ~WiredTigerRecordStore(); virtual void postConstructorInit(OperationContext* opCtx); @@ -383,6 +385,10 @@ private: // Non-null if this record store is underlying the active oplog. std::shared_ptr<OplogStones> _oplogStones; + + AtomicWord<int64_t> + _totalTimeTruncating; // Cumulative amount of time spent truncating the oplog. + AtomicWord<int64_t> _truncateCount; // Cumulative number of truncates of the oplog. }; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h index f6e9371c894..99697caac08 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h @@ -69,6 +69,11 @@ public: void awaitHasExcessStonesOrDead(); + void getOplogStonesStats(BSONObjBuilder& builder) const { + builder.append("totalTimeProcessingMicros", _totalTimeProcessing.load()); + builder.append("processingMethod", _processBySampling.load() ? "sampling" : "scanning"); + } + boost::optional<OplogStones::Stone> peekOldestStoneIfNeeded() const; void popOldestStone(); @@ -140,8 +145,11 @@ private: // deque of oplog stones. int64_t _minBytesPerStone; - AtomicWord<long long> _currentRecords; // Number of records in the stone being filled. - AtomicWord<long long> _currentBytes; // Number of bytes in the stone being filled. + AtomicWord<long long> _currentRecords; // Number of records in the stone being filled. 
+ AtomicWord<long long> _currentBytes; // Number of bytes in the stone being filled. + AtomicWord<int64_t> _totalTimeProcessing; // Amount of time spent scanning and/or sampling the + // oplog during start up, if any. + AtomicWord<bool> _processBySampling; // Whether the oplog was sampled or scanned. mutable stdx::mutex _mutex; // Protects against concurrent access to the deque of oplog stones. std::deque<OplogStones::Stone> _stones; // front = oldest, back = newest. diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.cpp index 335aebd34a5..100924d9e98 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.cpp @@ -34,6 +34,8 @@ #include "mongo/base/checked_cast.h" #include "mongo/bson/bsonobjbuilder.h" #include "mongo/db/concurrency/d_concurrency.h" +#include "mongo/db/db_raii.h" +#include "mongo/db/namespace_string.h" #include "mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h" #include "mongo/db/storage/wiredtiger/wiredtiger_record_store.h" #include "mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h" @@ -41,6 +43,7 @@ #include "mongo/db/storage/wiredtiger/wiredtiger_util.h" #include "mongo/util/assert_util.h" + namespace mongo { using std::string; @@ -85,4 +88,26 @@ BSONObj WiredTigerServerStatusSection::generateSection(OperationContext* opCtx, return bob.obj(); } +OplogStonesServerStatusSection::OplogStonesServerStatusSection() + : ServerStatusSection("oplogTruncation") {} + +bool OplogStonesServerStatusSection::includeByDefault() const { + return true; +} + +BSONObj OplogStonesServerStatusSection::generateSection(OperationContext* opCtx, + const BSONElement& configElement) const { + BSONObjBuilder builder; + { + AutoGetCollectionForReadCommand ctx(opCtx, NamespaceString::kRsOplogNamespace); + Collection* oplogColl = ctx.getCollection(); + if (oplogColl) { + auto oplogRS = 
checked_cast<WiredTigerRecordStore*>(oplogColl->getRecordStore()); + oplogRS->getOplogTruncateStats(builder); + } + } + return builder.obj(); +} + + } // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.h b/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.h index 9bea39b3398..1bcf68fff20 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_server_status.h @@ -49,4 +49,15 @@ private: WiredTigerKVEngine* _engine; }; +/** + * Adds oplog stones statistics to the results of db.serverStatus(). + */ +class OplogStonesServerStatusSection : public ServerStatusSection { +public: + OplogStonesServerStatusSection(); + bool includeByDefault() const override; + BSONObj generateSection(OperationContext* opCtx, + const BSONElement& configElement) const override; +}; + } // namespace mongo |