diff options
author | Kyle Suarez <kyle.suarez@mongodb.com> | 2018-04-07 10:52:37 -0400 |
---|---|---|
committer | Kyle Suarez <kyle.suarez@mongodb.com> | 2018-04-07 10:52:37 -0400 |
commit | fae36f1444627d28bd18e7395962078a729b940a (patch) | |
tree | f6a9a94ec8e528eef5c4bb2d404a5997e40c2966 /src/mongo/db | |
parent | 47e4b6b791cce13a36ea499a9311d54f100412ee (diff) | |
download | mongo-fae36f1444627d28bd18e7395962078a729b940a.tar.gz |
SERVER-33488 conditionally update WT size metadata during startup recovery
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_recovery.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/server_recovery.cpp | 71 | ||||
-rw-r--r-- | src/mongo/db/server_recovery.h | 90 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 32 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h | 20 |
6 files changed, 224 insertions, 1 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 17add6e4bf7..74c0191286c 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -482,6 +482,7 @@ env.Library( 'service_context.cpp', 'service_context_noop.cpp', 'service_context_registrar.cpp', + 'server_recovery.cpp', 'unclean_shutdown.cpp', ], LIBDEPS=[ diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index 16adfd39966..3ddf34a9e21 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -38,6 +38,7 @@ #include "mongo/db/repl/replication_consistency_markers_impl.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_tail.h" +#include "mongo/db/server_recovery.h" #include "mongo/db/session.h" #include "mongo/util/log.h" @@ -55,6 +56,16 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx, return; // Initial Sync will take over so no cleanup is needed. } + const auto serviceCtx = getGlobalServiceContext(); + inReplicationRecovery(serviceCtx) = true; + ON_BLOCK_EXIT([serviceCtx] { + invariant( + inReplicationRecovery(serviceCtx), + "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()"); + inReplicationRecovery(serviceCtx) = false; + sizeRecoveryState(serviceCtx).clearStateAfterRecovery(); + }); + const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx); if (!truncateAfterPoint.isNull()) { log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON(); diff --git a/src/mongo/db/server_recovery.cpp b/src/mongo/db/server_recovery.cpp new file mode 100644 index 00000000000..4f4f46f0293 --- /dev/null +++ b/src/mongo/db/server_recovery.cpp @@ -0,0 +1,71 @@ +/** + * Copyright (C) 2018 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/server_recovery.h" + +#include "mongo/db/namespace_string.h" + +namespace mongo { +namespace { +const auto getInReplicationRecovery = ServiceContext::declareDecoration<bool>(); +const auto getSizeRecoveryState = ServiceContext::declareDecoration<SizeRecoveryState>(); +} // namespace + +bool SizeRecoveryState::collectionNeedsSizeAdjustment(const std::string& ns) const { + if (!inReplicationRecovery(getGlobalServiceContext())) { + return true; + } + + if (NamespaceString::oplog(ns)) { + return true; + } + + stdx::lock_guard<stdx::mutex> lock(_mutex); + return _collectionsAlwaysNeedingSizeAdjustment.count(ns) > 0; +} + +void SizeRecoveryState::markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns) { + stdx::lock_guard<stdx::mutex> lock(_mutex); + _collectionsAlwaysNeedingSizeAdjustment.insert(ns); +} + +void SizeRecoveryState::clearStateAfterRecovery() { + stdx::lock_guard<stdx::mutex> lock(_mutex); + _collectionsAlwaysNeedingSizeAdjustment.clear(); +} +} // namespace mongo + +bool& mongo::inReplicationRecovery(ServiceContext* serviceCtx) { + return getInReplicationRecovery(serviceCtx); +} + +mongo::SizeRecoveryState& mongo::sizeRecoveryState(ServiceContext* serviceCtx) { + return getSizeRecoveryState(serviceCtx); +} diff --git a/src/mongo/db/server_recovery.h b/src/mongo/db/server_recovery.h new file mode 100644 index 00000000000..0fcd6ebc55d --- /dev/null +++ b/src/mongo/db/server_recovery.h @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2018 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#pragma once + +#include <set> +#include <string> + +#include "mongo/db/service_context.h" +#include "mongo/stdx/mutex.h" + +namespace mongo { +/** + * This class is for use with non-MMAPv1 storage engines that track record store sizes in catalog + * metadata. + * + * During normal server operation, we adjust the size metadata for all record stores. But when + * performing replication recovery, we avoid doing so, as we trust that the size metadata on disk is + * already correct with respect to the end state of recovery. + * + * However, there may be exceptions that require the server to adjust size metadata even during + * recovery. One such case is the oplog: during rollback, the oplog is truncated, and then recovery + * occurs using oplog entries after the common point from the sync source. The server will need to + * adjust the size metadata for the oplog namespace to ensure that the count of oplog entries is + * correct after rollback recovery. + * + * This class is responsible for keeping track of namespaces that require this special + * count adjustment. + */ +class SizeRecoveryState { +public: + /** + * If replication recovery is ongoing, returns false unless 'ns' is the oplog namespace or has + * been specifically marked as requiring adjustment even during recovery. + * + * If the system is not currently undergoing replication recovery, always returns true. + */ + bool collectionNeedsSizeAdjustment(const std::string& ns) const; + + /** + * Mark 'ns' as always requiring size adjustment, even if replication recovery is ongoing. + */ + void markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns); + + /** + * Clears all internal state. This method should be called when replication recovery ends. + */ + void clearStateAfterRecovery(); + +private: + mutable stdx::mutex _mutex; + std::set<std::string> _collectionsAlwaysNeedingSizeAdjustment; +}; + +/** + * Returns a mutable reference to the single SizeRecoveryState associated with 'serviceCtx'. + */ +SizeRecoveryState& sizeRecoveryState(ServiceContext* serviceCtx); + +/** + * Returns a mutable reference to a boolean decoration on 'serviceCtx', which indicates whether or + * not the server is currently undergoing replication recovery. + */ +bool& inReplicationRecovery(ServiceContext* serviceCtx); +} // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 6a1be102524..21b14a20734 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -30,6 +30,8 @@ */ #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage +#define LOG_FOR_RECOVERY(level) \ + MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kStorageRecovery) #include "mongo/platform/basic.h" @@ -45,6 +47,7 @@ #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" #include "mongo/db/repl/repl_settings.h" +#include "mongo/db/server_recovery.h" #include "mongo/db/service_context.h" #include "mongo/db/storage/oplog_hack.h" #include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h" @@ -654,6 +657,17 @@ WiredTigerRecordStore::WiredTigerRecordStore(WiredTigerKVEngine* kvEngine, if (_isOplog) { checkOplogFormatVersion(ctx, _uri); } + + // Most record stores will not have their size metadata adjusted during replication recovery. + // However, if this record store was created during the recovery process, we will need to keep + // track of size adjustments for any writes applied to it during recovery. + const auto serviceCtx = getGlobalServiceContext(); + if (inReplicationRecovery(serviceCtx)) { + LOG_FOR_RECOVERY(2) + << "Marking newly-created record store as needing size adjustment during recovery. ns: " + << ns() << ", ident: " << _uri; + sizeRecoveryState(serviceCtx).markCollectionAsAlwaysNeedsSizeAdjustment(ns()); + } } WiredTigerRecordStore::~WiredTigerRecordStore() { @@ -703,8 +717,18 @@ void WiredTigerRecordStore::postConstructorInit(OperationContext* opCtx) { } while ((record = cursor->next())); } } else { + // We found no records in this collection; however, there may actually be documents present + // if writes to this collection were not included in the stable checkpoint the last time + // this node shut down. We set the data size and the record count to zero, but will adjust + // these if writes are played during startup recovery. + LOG_FOR_RECOVERY(2) << "Record store was empty; setting count metadata to zero but marking " + "record store as needing size adjustment during recovery. ns: " + << ns() << ", ident: " << _uri; + sizeRecoveryState(getGlobalServiceContext()) + .markCollectionAsAlwaysNeedsSizeAdjustment(ns()); _dataSize.store(0); _numRecords.store(0); + // Need to start at 1 so we are always higher than RecordId::min() _nextIdNum.store(1); if (_sizeStorer) @@ -1585,6 +1609,10 @@ private: }; void WiredTigerRecordStore::_changeNumRecords(OperationContext* opCtx, int64_t diff) { + if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) { + return; + } + opCtx->recoveryUnit()->registerChange(new NumRecordsChange(this, diff)); if (_numRecords.fetchAndAdd(diff) < 0) _numRecords.store(std::max(diff, int64_t(0))); @@ -1604,6 +1632,10 @@ private: }; void WiredTigerRecordStore::_increaseDataSize(OperationContext* opCtx, int64_t amount) { + if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) { + return; + } + if (opCtx) opCtx->recoveryUnit()->registerChange(new DataSizeChange(this, amount)); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h index 6ff78eae68a..d0dc5e6cfd2 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h @@ -296,9 +296,27 @@ private: RecordId _nextId(); void _setId(RecordId id); bool cappedAndNeedDelete() const; + RecordData _getData(const WiredTigerCursor& cursor) const; + + /** + * Adjusts the record count and data size metadata for this record store, respectively. These + * functions consult the SizeRecoveryState to determine whether or not to actually change the + * size metadata if the server is undergoing recovery. + * + * For most record stores, we will not update the size metadata during recovery, as we trust + * that the values in the SizeStorer are accurate with respect to the end state of recovery. + * However, there are two exceptions: + * + * 1. When a record store is created as part of the recovery process. The SizeStorer will have + * no information about that newly-created ident. + * 2. When a record store is created at startup but constains no records as of the stable + * checkpoint timestamp. In this scenario, we will assume that the record store has a size + * of zero and will discard all cached size metadata. This assumption is incorrect if there + * are pending writes to this ident as part of the recovery process, and so we must + * always adjust size metadata for these idents. + */ void _changeNumRecords(OperationContext* opCtx, int64_t diff); void _increaseDataSize(OperationContext* opCtx, int64_t amount); - RecordData _getData(const WiredTigerCursor& cursor) const; const std::string _uri; |