diff options
author | wolfkdy <kdy71107216@aliyun.com> | 2017-03-04 16:10:58 +0800 |
---|---|---|
committer | Geert Bosch <geert@mongodb.com> | 2017-07-05 17:44:39 -0400 |
commit | 93beb0234eba9dc58ab6070ad472022f96e019e6 (patch) | |
tree | 2187cdfef5e3b602fb736d175677e4eb7db562f5 /src/mongo/db/storage | |
parent | 2aefd80d1acea065c77bd3bd69abf686a27ae3e0 (diff) | |
download | mongo-93beb0234eba9dc58ab6070ad472022f96e019e6.tar.gz |
SERVER-22766 WiredTiger engine: support updating the oplog size online
Diffstat (limited to 'src/mongo/db/storage')
9 files changed, 93 insertions, 84 deletions
diff --git a/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp b/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp index e96cddbb1a7..c775ada1a23 100644 --- a/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp +++ b/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp @@ -248,6 +248,12 @@ void KVCollectionCatalogEntry::updateValidator(OperationContext* opCtx, _catalog->putMetaData(opCtx, ns().toString(), md); } +void KVCollectionCatalogEntry::updateCappedSize(OperationContext* opCtx, long long size) { + MetaData md = _getMetaData(opCtx); + md.options.cappedSize = size; + _catalog->putMetaData(opCtx, ns().toString(), md); +} + BSONCollectionCatalogEntry::MetaData KVCollectionCatalogEntry::_getMetaData( OperationContext* opCtx) const { return _catalog->getMetaData(opCtx, ns().toString()); diff --git a/src/mongo/db/storage/kv/kv_collection_catalog_entry.h b/src/mongo/db/storage/kv/kv_collection_catalog_entry.h index c999bee1fe5..21f31691739 100644 --- a/src/mongo/db/storage/kv/kv_collection_catalog_entry.h +++ b/src/mongo/db/storage/kv/kv_collection_catalog_entry.h @@ -78,6 +78,8 @@ public: StringData validationLevel, StringData validationAction) final; + void updateCappedSize(OperationContext*, long long int) final; + RecordStore* getRecordStore() { return _recordStore.get(); } diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp index 617a6739cfe..b37ca245298 100644 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp @@ -442,4 +442,9 @@ void NamespaceDetailsCollectionCatalogEntry::setNamespacesRecordId(OperationCont _namespacesRecordId = newId; } } + +void NamespaceDetailsCollectionCatalogEntry::updateCappedSize(OperationContext* opCtx, + long long size) { + invariant(false); +} } diff --git 
a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h index f4e4410e317..aff16b22093 100644 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h +++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h @@ -102,6 +102,8 @@ public: StringData validationLevel, StringData validationAction) final; + void updateCappedSize(OperationContext* opCtx, long long size) final; + // not part of interface, but available to my storage engine int _findIndexNumber(OperationContext* opCtx, StringData indexName) const; diff --git a/src/mongo/db/storage/record_store.h b/src/mongo/db/storage/record_store.h index a6c2c0d9a09..8f7dbe7919a 100644 --- a/src/mongo/db/storage/record_store.h +++ b/src/mongo/db/storage/record_store.h @@ -623,6 +623,14 @@ public: long long numRecords, long long dataSize) = 0; + /** + * used to support online change oplog size. + */ + virtual Status updateCappedSize(OperationContext* opCtx, long long cappedSize) { + return Status(ErrorCodes::CommandNotSupported, + "this storage engine does not support updateCappedSize"); + } + protected: std::string _ns; }; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 38da23da215..0f28ef6a71f 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -155,11 +155,11 @@ WiredTigerRecordStore::OplogStones::OplogStones(OperationContext* opCtx, WiredTi const unsigned long long kMaxStonesToKeep = 100ULL; unsigned long long numStones = maxSize / BSONObjMaxInternalSize; - _numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones)); - _minBytesPerStone = maxSize / _numStonesToKeep; + size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones)); + _minBytesPerStone 
= maxSize / numStonesToKeep; invariant(_minBytesPerStone > 0); - _calculateStones(opCtx); + _calculateStones(opCtx, numStonesToKeep); _pokeReclaimThreadIfNeeded(); // Reclaim stones if over the limit. } @@ -179,8 +179,14 @@ void WiredTigerRecordStore::OplogStones::kill() { void WiredTigerRecordStore::OplogStones::awaitHasExcessStonesOrDead() { // Wait until kill() is called or there are too many oplog stones. stdx::unique_lock<stdx::mutex> lock(_oplogReclaimMutex); - while (!_isDead && !hasExcessStones()) { - MONGO_IDLE_THREAD_BLOCK; + while (!_isDead) { + { + MONGO_IDLE_THREAD_BLOCK; + stdx::lock_guard<stdx::mutex> lk(_mutex); + if (hasExcessStones_inlock()) { + break; + } + } _oplogReclaimCv.wait(lock); } } @@ -189,7 +195,7 @@ boost::optional<WiredTigerRecordStore::OplogStones::Stone> WiredTigerRecordStore::OplogStones::peekOldestStoneIfNeeded() const { stdx::lock_guard<stdx::mutex> lk(_mutex); - if (!hasExcessStones()) { + if (!hasExcessStones_inlock()) { return {}; } @@ -221,6 +227,7 @@ void WiredTigerRecordStore::OplogStones::createNewStoneIfNeeded(RecordId lastRec return; } + LOG(2) << "create new oplogStone, current stones:" << _stones.size(); OplogStones::Stone stone = {_currentRecords.swap(0), _currentBytes.swap(0), lastRecord}; _stones.push_back(stone); @@ -279,17 +286,8 @@ void WiredTigerRecordStore::OplogStones::setMinBytesPerStone(int64_t size) { _minBytesPerStone = size; } -void WiredTigerRecordStore::OplogStones::setNumStonesToKeep(size_t numStones) { - invariant(numStones > 0); - - stdx::lock_guard<stdx::mutex> lk(_mutex); - - // Only allow changing the number of stones to keep if no data has been inserted. 
- invariant(_stones.size() == 0 && _currentRecords.load() == 0); - _numStonesToKeep = numStones; -} - -void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCtx) { +void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCtx, + size_t numStonesToKeep) { long long numRecords = _rs->numRecords(opCtx); long long dataSize = _rs->dataSize(opCtx); @@ -304,7 +302,7 @@ void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCt // oplog to determine where to put down stones. if (numRecords <= 0 || dataSize <= 0 || uint64_t(numRecords) < - kMinSampleRatioForRandCursor * kRandomSamplesPerStone * _numStonesToKeep) { + kMinSampleRatioForRandCursor * kRandomSamplesPerStone * numStonesToKeep) { _calculateStonesByScanning(opCtx); return; } @@ -427,11 +425,23 @@ void WiredTigerRecordStore::OplogStones::_calculateStonesBySampling(OperationCon } void WiredTigerRecordStore::OplogStones::_pokeReclaimThreadIfNeeded() { - if (hasExcessStones()) { + if (hasExcessStones_inlock()) { _oplogReclaimCv.notify_one(); } } +void WiredTigerRecordStore::OplogStones::adjust(int64_t maxSize) { + stdx::lock_guard<stdx::mutex> lk(_mutex); + const unsigned long long kMinStonesToKeep = 10ULL; + const unsigned long long kMaxStonesToKeep = 100ULL; + + unsigned long long numStones = maxSize / BSONObjMaxInternalSize; + size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones)); + _minBytesPerStone = maxSize / numStonesToKeep; + invariant(_minBytesPerStone > 0); + _pokeReclaimThreadIfNeeded(); +} + StatusWith<std::string> WiredTigerRecordStore::parseOptionsField(const BSONObj options) { StringBuilder ss; BSONForEach(elem, options) { @@ -2098,4 +2108,15 @@ void WiredTigerRecordStorePrefixedCursor::initCursorToBeginning() { } } +Status WiredTigerRecordStore::updateCappedSize(OperationContext* opCtx, long long cappedSize) { + if (_cappedMaxSize == cappedSize) { + return Status::OK(); + } + _cappedMaxSize = 
cappedSize; + if (_oplogStones) { + _oplogStones->adjust(cappedSize); + } + return Status::OK(); +} + } // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h index 726febce42c..aca92fbea1d 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h @@ -218,6 +218,8 @@ public: void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx) const override; + Status updateCappedSize(OperationContext* opCtx, long long cappedSize) final; + bool isOplog() const { return _isOplog; } @@ -317,7 +319,7 @@ private: const bool _isEphemeral; // True if the namespace of this record store starts with "local.oplog.", and false otherwise. const bool _isOplog; - const int64_t _cappedMaxSize; + int64_t _cappedMaxSize; const int64_t _cappedMaxSizeSlack; // when to start applying backpressure const int64_t _cappedMaxDocs; RecordId _cappedFirstRecord; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h index f42ade90db8..dfd3dddb902 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h @@ -57,8 +57,14 @@ public: void kill(); - bool hasExcessStones() const { - return _stones.size() > _numStonesToKeep; + bool hasExcessStones_inlock() const { + int64_t total_bytes = 0; + for (std::deque<OplogStones::Stone>::const_iterator it = _stones.begin(); + it != _stones.end(); + ++it) { + total_bytes += it->bytes; + } + return total_bytes > _rs->cappedMaxSize(); } void awaitHasExcessStonesOrDead(); @@ -81,6 +87,9 @@ public: int64_t bytesRemoved, RecordId firstRemovedId); + // Resize oplog size + void adjust(int64_t maxSize); + // The start point of where to truncate next. 
Used by the background reclaim thread to // efficiently truncate records with WiredTiger by skipping over tombstones, etc. RecordId firstRecord; @@ -104,13 +113,11 @@ public: void setMinBytesPerStone(int64_t size); - void setNumStonesToKeep(size_t numStones); - private: class InsertChange; class TruncateChange; - void _calculateStones(OperationContext* opCtx); + void _calculateStones(OperationContext* opCtx, size_t size); void _calculateStonesByScanning(OperationContext* opCtx); void _calculateStonesBySampling(OperationContext* opCtx, int64_t estRecordsPerStone, @@ -129,12 +136,8 @@ private: // database, and false otherwise. bool _isDead = false; - // Maximum number of stones to keep in the deque before the background reclaim thread should - // truncate the oldest ones. Does not include the stone currently being filled. This value - // should not be changed after initialization. - size_t _numStonesToKeep; // Minimum number of bytes the stone being filled should contain before it gets added to the - // deque of oplog stones. This value should not be changed after initialization. + // deque of oplog stones. int64_t _minBytesPerStone; AtomicInt64 _currentRecords; // Number of records in the stone being filled. diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp index 0d15a514950..75b5c11e673 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp @@ -1034,7 +1034,7 @@ TEST(WiredTigerRecordStoreTest, OplogStones_CappedTruncateAfter) { } } -// Verify that oplog stones are reclaimed when the number of stones to keep is exceeded. +// Verify that oplog stones are reclaimed when cappedMaxSize is exceeded. 
TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) { std::unique_ptr<RecordStoreHarnessHelper> harnessHelper = newRecordStoreHarnessHelper(); @@ -1045,8 +1045,12 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) { WiredTigerRecordStore* wtrs = static_cast<WiredTigerRecordStore*>(rs.get()); WiredTigerRecordStore::OplogStones* oplogStones = wtrs->oplogStones(); + { + ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); + ASSERT_OK(wtrs->updateCappedSize(opCtx.get(), 230U)); + } + oplogStones->setMinBytesPerStone(100); - oplogStones->setNumStonesToKeep(2U); { ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); @@ -1062,7 +1066,7 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) { ASSERT_EQ(0, oplogStones->currentBytes()); } - // Truncate a stone when number of stones to keep is exceeded. + // Truncate a stone when cappedMaxSize is exceeded. { ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); @@ -1095,71 +1099,27 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) { wtrs->reclaimOplog(opCtx.get()); - ASSERT_EQ(3, rs->numRecords(opCtx.get())); - ASSERT_EQ(320, rs->dataSize(opCtx.get())); - ASSERT_EQ(2U, oplogStones->numStones()); + ASSERT_EQ(2, rs->numRecords(opCtx.get())); + ASSERT_EQ(190, rs->dataSize(opCtx.get())); + ASSERT_EQ(1U, oplogStones->numStones()); ASSERT_EQ(1, oplogStones->currentRecords()); ASSERT_EQ(50, oplogStones->currentBytes()); } - // No-op if the number of oplog stones is less than or equal to the number of stones to keep. + // No-op if dataSize <= cappedMaxSize. 
{ ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); wtrs->reclaimOplog(opCtx.get()); - ASSERT_EQ(3, rs->numRecords(opCtx.get())); - ASSERT_EQ(320, rs->dataSize(opCtx.get())); - ASSERT_EQ(2U, oplogStones->numStones()); + ASSERT_EQ(2, rs->numRecords(opCtx.get())); + ASSERT_EQ(190, rs->dataSize(opCtx.get())); + ASSERT_EQ(1U, oplogStones->numStones()); ASSERT_EQ(1, oplogStones->currentRecords()); ASSERT_EQ(50, oplogStones->currentBytes()); } } -// Verify that oplog stones are not reclaimed even if the size of the record store exceeds -// 'cappedMaxSize'. -TEST(WiredTigerRecordStoreTest, OplogStones_ExceedCappedMaxSize) { - std::unique_ptr<RecordStoreHarnessHelper> harnessHelper = newRecordStoreHarnessHelper(); - - const int64_t cappedMaxSize = 256; - unique_ptr<RecordStore> rs( - harnessHelper->newCappedRecordStore("local.oplog.stones", cappedMaxSize, -1)); - - WiredTigerRecordStore* wtrs = static_cast<WiredTigerRecordStore*>(rs.get()); - WiredTigerRecordStore::OplogStones* oplogStones = wtrs->oplogStones(); - - oplogStones->setMinBytesPerStone(100); - oplogStones->setNumStonesToKeep(10U); - - { - ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); - - ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 1), 100), RecordId(1, 1)); - ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 2), 110), RecordId(1, 2)); - ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 3), 120), RecordId(1, 3)); - - ASSERT_EQ(3, rs->numRecords(opCtx.get())); - ASSERT_EQ(330, rs->dataSize(opCtx.get())); - ASSERT_EQ(3U, oplogStones->numStones()); - ASSERT_EQ(0, oplogStones->currentRecords()); - ASSERT_EQ(0, oplogStones->currentBytes()); - } - - // Shouldn't truncate a stone when the number of oplog stones is less than the number of stones - // to keep, even though the size of the record store exceeds 'cappedMaxSize'. 
- { - ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext()); - - wtrs->reclaimOplog(opCtx.get()); - - ASSERT_EQ(3, rs->numRecords(opCtx.get())); - ASSERT_EQ(330, rs->dataSize(opCtx.get())); - ASSERT_EQ(3U, oplogStones->numStones()); - ASSERT_EQ(0, oplogStones->currentRecords()); - ASSERT_EQ(0, oplogStones->currentBytes()); - } -} - // Verify that an oplog stone isn't created if it would cause the logical representation of the // records to not be in increasing order. TEST(WiredTigerRecordStoreTest, OplogStones_AscendingOrder) { |