SERVER-22766 WiredTiger engine: support updating the oplog size online

author: wolfkdy <kdy71107216@aliyun.com> 2017-03-04 16:10:58 +0800
committer: Geert Bosch <geert@mongodb.com> 2017-07-05 17:44:39 -0400
commit: 93beb0234eba9dc58ab6070ad472022f96e019e6 (patch)
tree: 2187cdfef5e3b602fb736d175677e4eb7db562f5 /src/mongo/db/storage
parent: 2aefd80d1acea065c77bd3bd69abf686a27ae3e0 (diff)
download: mongo-93beb0234eba9dc58ab6070ad472022f96e019e6.tar.gz
9 files changed, 93 insertions, 84 deletions
diff --git a/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp b/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp
index e96cddbb1a7..c775ada1a23 100644
--- a/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp
+++ b/src/mongo/db/storage/kv/kv_collection_catalog_entry.cpp
@@ -248,6 +248,12 @@ void KVCollectionCatalogEntry::updateValidator(OperationContext* opCtx,
     _catalog->putMetaData(opCtx, ns().toString(), md);
 }
 
+void KVCollectionCatalogEntry::updateCappedSize(OperationContext* opCtx, long long size) {
+    MetaData md = _getMetaData(opCtx);
+    md.options.cappedSize = size;
+    _catalog->putMetaData(opCtx, ns().toString(), md);
+}
+
 BSONCollectionCatalogEntry::MetaData KVCollectionCatalogEntry::_getMetaData(
     OperationContext* opCtx) const {
     return _catalog->getMetaData(opCtx, ns().toString());
diff --git a/src/mongo/db/storage/kv/kv_collection_catalog_entry.h b/src/mongo/db/storage/kv/kv_collection_catalog_entry.h
index c999bee1fe5..21f31691739 100644
--- a/src/mongo/db/storage/kv/kv_collection_catalog_entry.h
+++ b/src/mongo/db/storage/kv/kv_collection_catalog_entry.h
@@ -78,6 +78,8 @@ public:
                          StringData validationLevel,
                          StringData validationAction) final;
 
+    void updateCappedSize(OperationContext*, long long int) final;
+
     RecordStore* getRecordStore() {
         return _recordStore.get();
     }
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
index 617a6739cfe..b37ca245298 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp
@@ -442,4 +442,9 @@ void NamespaceDetailsCollectionCatalogEntry::setNamespacesRecordId(OperationCont
         _namespacesRecordId = newId;
     }
 }
+
+void NamespaceDetailsCollectionCatalogEntry::updateCappedSize(OperationContext* opCtx,
+                                                              long long size) {
+    invariant(false);
+}
 }
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
index f4e4410e317..aff16b22093 100644
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
+++ b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h
@@ -102,6 +102,8 @@ public:
                          StringData validationLevel,
                          StringData validationAction) final;
 
+    void updateCappedSize(OperationContext* opCtx, long long size) final;
+
     // not part of interface, but available to my storage engine
 
     int _findIndexNumber(OperationContext* opCtx, StringData indexName) const;
diff --git a/src/mongo/db/storage/record_store.h b/src/mongo/db/storage/record_store.h
index a6c2c0d9a09..8f7dbe7919a 100644
--- a/src/mongo/db/storage/record_store.h
+++ b/src/mongo/db/storage/record_store.h
@@ -623,6 +623,14 @@ public:
                                         long long numRecords,
                                         long long dataSize) = 0;
 
+    /**
+     * Used to support changing the oplog size online.
+     */
+    virtual Status updateCappedSize(OperationContext* opCtx, long long cappedSize) {
+        return Status(ErrorCodes::CommandNotSupported,
+                      "this storage engine does not support updateCappedSize");
+    }
+
 protected:
     std::string _ns;
 };
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 38da23da215..0f28ef6a71f 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -155,11 +155,11 @@ WiredTigerRecordStore::OplogStones::OplogStones(OperationContext* opCtx, WiredTi
     const unsigned long long kMaxStonesToKeep = 100ULL;
 
     unsigned long long numStones = maxSize / BSONObjMaxInternalSize;
-    _numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones));
-    _minBytesPerStone = maxSize / _numStonesToKeep;
+    size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones));
+    _minBytesPerStone = maxSize / numStonesToKeep;
     invariant(_minBytesPerStone > 0);
 
-    _calculateStones(opCtx);
+    _calculateStones(opCtx, numStonesToKeep);
     _pokeReclaimThreadIfNeeded();  // Reclaim stones if over the limit.
 }
 
@@ -179,8 +179,14 @@ void WiredTigerRecordStore::OplogStones::kill() {
 void WiredTigerRecordStore::OplogStones::awaitHasExcessStonesOrDead() {
     // Wait until kill() is called or there are too many oplog stones.
     stdx::unique_lock<stdx::mutex> lock(_oplogReclaimMutex);
-    while (!_isDead && !hasExcessStones()) {
-        MONGO_IDLE_THREAD_BLOCK;
+    while (!_isDead) {
+        {
+            MONGO_IDLE_THREAD_BLOCK;
+            stdx::lock_guard<stdx::mutex> lk(_mutex);
+            if (hasExcessStones_inlock()) {
+                break;
+            }
+        }
         _oplogReclaimCv.wait(lock);
     }
 }
@@ -189,7 +195,7 @@ boost::optional<WiredTigerRecordStore::OplogStones::Stone>
 WiredTigerRecordStore::OplogStones::peekOldestStoneIfNeeded() const {
     stdx::lock_guard<stdx::mutex> lk(_mutex);
 
-    if (!hasExcessStones()) {
+    if (!hasExcessStones_inlock()) {
         return {};
     }
 
@@ -221,6 +227,7 @@ void WiredTigerRecordStore::OplogStones::createNewStoneIfNeeded(RecordId lastRec
         return;
     }
 
+    LOG(2) << "create new oplogStone, current stones:" << _stones.size();
     OplogStones::Stone stone = {_currentRecords.swap(0), _currentBytes.swap(0), lastRecord};
     _stones.push_back(stone);
 
@@ -279,17 +286,8 @@ void WiredTigerRecordStore::OplogStones::setMinBytesPerStone(int64_t size) {
     _minBytesPerStone = size;
 }
 
-void WiredTigerRecordStore::OplogStones::setNumStonesToKeep(size_t numStones) {
-    invariant(numStones > 0);
-
-    stdx::lock_guard<stdx::mutex> lk(_mutex);
-
-    // Only allow changing the number of stones to keep if no data has been inserted.
-    invariant(_stones.size() == 0 && _currentRecords.load() == 0);
-    _numStonesToKeep = numStones;
-}
-
-void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCtx) {
+void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCtx,
+                                                          size_t numStonesToKeep) {
     long long numRecords = _rs->numRecords(opCtx);
     long long dataSize = _rs->dataSize(opCtx);
 
@@ -304,7 +302,7 @@ void WiredTigerRecordStore::OplogStones::_calculateStones(OperationContext* opCt
     // oplog to determine where to put down stones.
     if (numRecords <= 0 || dataSize <= 0 ||
         uint64_t(numRecords) <
-            kMinSampleRatioForRandCursor * kRandomSamplesPerStone * _numStonesToKeep) {
+            kMinSampleRatioForRandCursor * kRandomSamplesPerStone * numStonesToKeep) {
         _calculateStonesByScanning(opCtx);
         return;
     }
@@ -427,11 +425,23 @@ void WiredTigerRecordStore::OplogStones::_calculateStonesBySampling(OperationCon
 }
 
 void WiredTigerRecordStore::OplogStones::_pokeReclaimThreadIfNeeded() {
-    if (hasExcessStones()) {
+    if (hasExcessStones_inlock()) {
         _oplogReclaimCv.notify_one();
     }
 }
 
+void WiredTigerRecordStore::OplogStones::adjust(int64_t maxSize) {
+    stdx::lock_guard<stdx::mutex> lk(_mutex);
+    const unsigned long long kMinStonesToKeep = 10ULL;
+    const unsigned long long kMaxStonesToKeep = 100ULL;
+
+    unsigned long long numStones = maxSize / BSONObjMaxInternalSize;
+    size_t numStonesToKeep = std::min(kMaxStonesToKeep, std::max(kMinStonesToKeep, numStones));
+    _minBytesPerStone = maxSize / numStonesToKeep;
+    invariant(_minBytesPerStone > 0);
+    _pokeReclaimThreadIfNeeded();
+}
+
 StatusWith<std::string> WiredTigerRecordStore::parseOptionsField(const BSONObj options) {
     StringBuilder ss;
     BSONForEach(elem, options) {
@@ -2098,4 +2108,15 @@ void WiredTigerRecordStorePrefixedCursor::initCursorToBeginning() {
     }
 }
 
+Status WiredTigerRecordStore::updateCappedSize(OperationContext* opCtx, long long cappedSize) {
+    if (_cappedMaxSize == cappedSize) {
+        return Status::OK();
+    }
+    _cappedMaxSize = cappedSize;
+    if (_oplogStones) {
+        _oplogStones->adjust(cappedSize);
+    }
+    return Status::OK();
+}
+
 }  // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
index 726febce42c..aca92fbea1d 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
@@ -218,6 +218,8 @@ public:
 
     void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx) const override;
 
+    Status updateCappedSize(OperationContext* opCtx, long long cappedSize) final;
+
     bool isOplog() const {
         return _isOplog;
     }
@@ -317,7 +319,7 @@ private:
     const bool _isEphemeral;
     // True if the namespace of this record store starts with "local.oplog.", and false otherwise.
     const bool _isOplog;
-    const int64_t _cappedMaxSize;
+    int64_t _cappedMaxSize;
     const int64_t _cappedMaxSizeSlack;  // when to start applying backpressure
     const int64_t _cappedMaxDocs;
     RecordId _cappedFirstRecord;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h
index f42ade90db8..dfd3dddb902 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_oplog_stones.h
@@ -57,8 +57,14 @@ public:
 
     void kill();
 
-    bool hasExcessStones() const {
-        return _stones.size() > _numStonesToKeep;
+    bool hasExcessStones_inlock() const {
+        int64_t total_bytes = 0;
+        for (std::deque<OplogStones::Stone>::const_iterator it = _stones.begin();
+             it != _stones.end();
+             ++it) {
+            total_bytes += it->bytes;
+        }
+        return total_bytes > _rs->cappedMaxSize();
     }
 
     void awaitHasExcessStonesOrDead();
@@ -81,6 +87,9 @@ public:
                                               int64_t bytesRemoved,
                                               RecordId firstRemovedId);
 
+    // Recomputes the minimum bytes per stone for a new maximum oplog size.
+    void adjust(int64_t maxSize);
+
     // The start point of where to truncate next. Used by the background reclaim thread to
     // efficiently truncate records with WiredTiger by skipping over tombstones, etc.
     RecordId firstRecord;
@@ -104,13 +113,11 @@ public:
 
     void setMinBytesPerStone(int64_t size);
 
-    void setNumStonesToKeep(size_t numStones);
-
 private:
     class InsertChange;
     class TruncateChange;
 
-    void _calculateStones(OperationContext* opCtx);
+    void _calculateStones(OperationContext* opCtx, size_t size);
     void _calculateStonesByScanning(OperationContext* opCtx);
     void _calculateStonesBySampling(OperationContext* opCtx,
                                     int64_t estRecordsPerStone,
@@ -129,12 +136,8 @@ private:
     // database, and false otherwise.
     bool _isDead = false;
 
-    // Maximum number of stones to keep in the deque before the background reclaim thread should
-    // truncate the oldest ones. Does not include the stone currently being filled. This value
-    // should not be changed after initialization.
-    size_t _numStonesToKeep;
     // Minimum number of bytes the stone being filled should contain before it gets added to the
-    // deque of oplog stones. This value should not be changed after initialization.
+    // deque of oplog stones.
     int64_t _minBytesPerStone;
 
     AtomicInt64 _currentRecords;  // Number of records in the stone being filled.
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp
index 0d15a514950..75b5c11e673 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store_test.cpp
@@ -1034,7 +1034,7 @@ TEST(WiredTigerRecordStoreTest, OplogStones_CappedTruncateAfter) {
     }
 }
 
-// Verify that oplog stones are reclaimed when the number of stones to keep is exceeded.
+// Verify that oplog stones are reclaimed when cappedMaxSize is exceeded.
 TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) {
     std::unique_ptr<RecordStoreHarnessHelper> harnessHelper = newRecordStoreHarnessHelper();
 
@@ -1045,8 +1045,12 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) {
     WiredTigerRecordStore* wtrs = static_cast<WiredTigerRecordStore*>(rs.get());
     WiredTigerRecordStore::OplogStones* oplogStones = wtrs->oplogStones();
 
+    {
+        ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
+        ASSERT_OK(wtrs->updateCappedSize(opCtx.get(), 230U));
+    }
+
     oplogStones->setMinBytesPerStone(100);
-    oplogStones->setNumStonesToKeep(2U);
 
     {
         ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
@@ -1062,7 +1066,7 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) {
         ASSERT_EQ(0, oplogStones->currentBytes());
     }
 
-    // Truncate a stone when number of stones to keep is exceeded.
+    // Truncate a stone when cappedMaxSize is exceeded.
     {
         ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
 
@@ -1095,71 +1099,27 @@ TEST(WiredTigerRecordStoreTest, OplogStones_ReclaimStones) {
 
         wtrs->reclaimOplog(opCtx.get());
 
-        ASSERT_EQ(3, rs->numRecords(opCtx.get()));
-        ASSERT_EQ(320, rs->dataSize(opCtx.get()));
-        ASSERT_EQ(2U, oplogStones->numStones());
+        ASSERT_EQ(2, rs->numRecords(opCtx.get()));
+        ASSERT_EQ(190, rs->dataSize(opCtx.get()));
+        ASSERT_EQ(1U, oplogStones->numStones());
         ASSERT_EQ(1, oplogStones->currentRecords());
         ASSERT_EQ(50, oplogStones->currentBytes());
     }
 
-    // No-op if the number of oplog stones is less than or equal to the number of stones to keep.
+    // No-op if dataSize <= cappedMaxSize.
     {
         ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
 
         wtrs->reclaimOplog(opCtx.get());
 
-        ASSERT_EQ(3, rs->numRecords(opCtx.get()));
-        ASSERT_EQ(320, rs->dataSize(opCtx.get()));
-        ASSERT_EQ(2U, oplogStones->numStones());
+        ASSERT_EQ(2, rs->numRecords(opCtx.get()));
+        ASSERT_EQ(190, rs->dataSize(opCtx.get()));
+        ASSERT_EQ(1U, oplogStones->numStones());
         ASSERT_EQ(1, oplogStones->currentRecords());
         ASSERT_EQ(50, oplogStones->currentBytes());
     }
 }
 
-// Verify that oplog stones are not reclaimed even if the size of the record store exceeds
-// 'cappedMaxSize'.
-TEST(WiredTigerRecordStoreTest, OplogStones_ExceedCappedMaxSize) {
-    std::unique_ptr<RecordStoreHarnessHelper> harnessHelper = newRecordStoreHarnessHelper();
-
-    const int64_t cappedMaxSize = 256;
-    unique_ptr<RecordStore> rs(
-        harnessHelper->newCappedRecordStore("local.oplog.stones", cappedMaxSize, -1));
-
-    WiredTigerRecordStore* wtrs = static_cast<WiredTigerRecordStore*>(rs.get());
-    WiredTigerRecordStore::OplogStones* oplogStones = wtrs->oplogStones();
-
-    oplogStones->setMinBytesPerStone(100);
-    oplogStones->setNumStonesToKeep(10U);
-
-    {
-        ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
-
-        ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 1), 100), RecordId(1, 1));
-        ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 2), 110), RecordId(1, 2));
-        ASSERT_EQ(insertBSONWithSize(opCtx.get(), rs.get(), Timestamp(1, 3), 120), RecordId(1, 3));
-
-        ASSERT_EQ(3, rs->numRecords(opCtx.get()));
-        ASSERT_EQ(330, rs->dataSize(opCtx.get()));
-        ASSERT_EQ(3U, oplogStones->numStones());
-        ASSERT_EQ(0, oplogStones->currentRecords());
-        ASSERT_EQ(0, oplogStones->currentBytes());
-    }
-
-    // Shouldn't truncate a stone when the number of oplog stones is less than the number of stones
-    // to keep, even though the size of the record store exceeds 'cappedMaxSize'.
-    {
-        ServiceContext::UniqueOperationContext opCtx(harnessHelper->newOperationContext());
-
-        wtrs->reclaimOplog(opCtx.get());
-
-        ASSERT_EQ(3, rs->numRecords(opCtx.get()));
-        ASSERT_EQ(330, rs->dataSize(opCtx.get()));
-        ASSERT_EQ(3U, oplogStones->numStones());
-        ASSERT_EQ(0, oplogStones->currentRecords());
-        ASSERT_EQ(0, oplogStones->currentBytes());
-    }
-}
-
 // Verify that an oplog stone isn't created if it would cause the logical representation of the
 // records to not be in increasing order.
 TEST(WiredTigerRecordStoreTest, OplogStones_AscendingOrder) {
author	wolfkdy <kdy71107216@aliyun.com>	2017-03-04 16:10:58 +0800
committer	Geert Bosch <geert@mongodb.com>	2017-07-05 17:44:39 -0400
commit	93beb0234eba9dc58ab6070ad472022f96e019e6 (patch)
tree	2187cdfef5e3b602fb736d175677e4eb7db562f5 /src/mongo/db/storage
parent	2aefd80d1acea065c77bd3bd69abf686a27ae3e0 (diff)
download	mongo-93beb0234eba9dc58ab6070ad472022f96e019e6.tar.gz