summary | refs | log | tree | commit | diff
path: root/src/mongo/db
diff options
context:
space:
mode:
author: Kyle Suarez <kyle.suarez@mongodb.com> 2018-04-07 10:52:37 -0400
committer: Kyle Suarez <kyle.suarez@mongodb.com> 2018-04-07 10:52:37 -0400
commit: fae36f1444627d28bd18e7395962078a729b940a (patch)
tree: f6a9a94ec8e528eef5c4bb2d404a5997e40c2966 /src/mongo/db
parent: 47e4b6b791cce13a36ea499a9311d54f100412ee (diff)
download: mongo-fae36f1444627d28bd18e7395962078a729b940a.tar.gz
SERVER-33488 conditionally update WT size metadata during startup recovery
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- src/mongo/db/SConscript | 1
-rw-r--r-- src/mongo/db/repl/replication_recovery.cpp | 11
-rw-r--r-- src/mongo/db/server_recovery.cpp | 71
-rw-r--r-- src/mongo/db/server_recovery.h | 90
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 32
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h | 20
6 files changed, 224 insertions, 1 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 17add6e4bf7..74c0191286c 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -482,6 +482,7 @@ env.Library(
'service_context.cpp',
'service_context_noop.cpp',
'service_context_registrar.cpp',
+ 'server_recovery.cpp',
'unclean_shutdown.cpp',
],
LIBDEPS=[
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 16adfd39966..3ddf34a9e21 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -38,6 +38,7 @@
#include "mongo/db/repl/replication_consistency_markers_impl.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/sync_tail.h"
+#include "mongo/db/server_recovery.h"
#include "mongo/db/session.h"
#include "mongo/util/log.h"
@@ -55,6 +56,16 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
return; // Initial Sync will take over so no cleanup is needed.
}
+ const auto serviceCtx = getGlobalServiceContext();
+ inReplicationRecovery(serviceCtx) = true;
+ ON_BLOCK_EXIT([serviceCtx] {
+ invariant(
+ inReplicationRecovery(serviceCtx),
+ "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()");
+ inReplicationRecovery(serviceCtx) = false;
+ sizeRecoveryState(serviceCtx).clearStateAfterRecovery();
+ });
+
const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
if (!truncateAfterPoint.isNull()) {
log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
diff --git a/src/mongo/db/server_recovery.cpp b/src/mongo/db/server_recovery.cpp
new file mode 100644
index 00000000000..4f4f46f0293
--- /dev/null
+++ b/src/mongo/db/server_recovery.cpp
@@ -0,0 +1,71 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/server_recovery.h"
+
+#include "mongo/db/namespace_string.h"
+
+namespace mongo {
+namespace {
+const auto getInReplicationRecovery = ServiceContext::declareDecoration<bool>();
+const auto getSizeRecoveryState = ServiceContext::declareDecoration<SizeRecoveryState>();
+} // namespace
+
+bool SizeRecoveryState::collectionNeedsSizeAdjustment(const std::string& ns) const {
+ if (!inReplicationRecovery(getGlobalServiceContext())) {
+ return true;
+ }
+
+ if (NamespaceString::oplog(ns)) {
+ return true;
+ }
+
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ return _collectionsAlwaysNeedingSizeAdjustment.count(ns) > 0;
+}
+
+void SizeRecoveryState::markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns) {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ _collectionsAlwaysNeedingSizeAdjustment.insert(ns);
+}
+
+void SizeRecoveryState::clearStateAfterRecovery() {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ _collectionsAlwaysNeedingSizeAdjustment.clear();
+}
+} // namespace mongo
+
+bool& mongo::inReplicationRecovery(ServiceContext* serviceCtx) {
+ return getInReplicationRecovery(serviceCtx);
+}
+
+mongo::SizeRecoveryState& mongo::sizeRecoveryState(ServiceContext* serviceCtx) {
+ return getSizeRecoveryState(serviceCtx);
+}
diff --git a/src/mongo/db/server_recovery.h b/src/mongo/db/server_recovery.h
new file mode 100644
index 00000000000..0fcd6ebc55d
--- /dev/null
+++ b/src/mongo/db/server_recovery.h
@@ -0,0 +1,90 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#pragma once
+
+#include <set>
+#include <string>
+
+#include "mongo/db/service_context.h"
+#include "mongo/stdx/mutex.h"
+
+namespace mongo {
+/**
+ * This class is for use with non-MMAPv1 storage engines that track record store sizes in catalog
+ * metadata.
+ *
+ * During normal server operation, we adjust the size metadata for all record stores. But when
+ * performing replication recovery, we avoid doing so, as we trust that the size metadata on disk is
+ * already correct with respect to the end state of recovery.
+ *
+ * However, there may be exceptions that require the server to adjust size metadata even during
+ * recovery. One such case is the oplog: during rollback, the oplog is truncated, and then recovery
+ * occurs using oplog entries after the common point from the sync source. The server will need to
+ * adjust the size metadata for the oplog namespace to ensure that the count of oplog entries is
+ * correct after rollback recovery.
+ *
+ * This class is responsible for keeping track of namespaces that require this special
+ * count adjustment.
+ */
+class SizeRecoveryState {
+public:
+ /**
+ * If replication recovery is ongoing, returns false unless 'ns' is the oplog namespace or has
+ * been specifically marked as requiring adjustment even during recovery.
+ *
+ * If the system is not currently undergoing replication recovery, always returns true.
+ */
+ bool collectionNeedsSizeAdjustment(const std::string& ns) const;
+
+ /**
+ * Mark 'ns' as always requiring size adjustment, even if replication recovery is ongoing.
+ */
+ void markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns);
+
+ /**
+ * Clears all internal state. This method should be called when replication recovery ends.
+ */
+ void clearStateAfterRecovery();
+
+private:
+ mutable stdx::mutex _mutex;
+ std::set<std::string> _collectionsAlwaysNeedingSizeAdjustment;
+};
+
+/**
+ * Returns a mutable reference to the single SizeRecoveryState associated with 'serviceCtx'.
+ */
+SizeRecoveryState& sizeRecoveryState(ServiceContext* serviceCtx);
+
+/**
+ * Returns a mutable reference to a boolean decoration on 'serviceCtx', which indicates whether or
+ * not the server is currently undergoing replication recovery.
+ */
+bool& inReplicationRecovery(ServiceContext* serviceCtx);
+} // namespace mongo
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 6a1be102524..21b14a20734 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -30,6 +30,8 @@
*/
#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage
+#define LOG_FOR_RECOVERY(level) \
+ MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kStorageRecovery)
#include "mongo/platform/basic.h"
@@ -45,6 +47,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/repl_settings.h"
+#include "mongo/db/server_recovery.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/oplog_hack.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h"
@@ -654,6 +657,17 @@ WiredTigerRecordStore::WiredTigerRecordStore(WiredTigerKVEngine* kvEngine,
if (_isOplog) {
checkOplogFormatVersion(ctx, _uri);
}
+
+ // Most record stores will not have their size metadata adjusted during replication recovery.
+ // However, if this record store was created during the recovery process, we will need to keep
+ // track of size adjustments for any writes applied to it during recovery.
+ const auto serviceCtx = getGlobalServiceContext();
+ if (inReplicationRecovery(serviceCtx)) {
+ LOG_FOR_RECOVERY(2)
+ << "Marking newly-created record store as needing size adjustment during recovery. ns: "
+ << ns() << ", ident: " << _uri;
+ sizeRecoveryState(serviceCtx).markCollectionAsAlwaysNeedsSizeAdjustment(ns());
+ }
}
WiredTigerRecordStore::~WiredTigerRecordStore() {
@@ -703,8 +717,18 @@ void WiredTigerRecordStore::postConstructorInit(OperationContext* opCtx) {
} while ((record = cursor->next()));
}
} else {
+ // We found no records in this collection; however, there may actually be documents present
+ // if writes to this collection were not included in the stable checkpoint the last time
+ // this node shut down. We set the data size and the record count to zero, but will adjust
+ // these if writes are played during startup recovery.
+ LOG_FOR_RECOVERY(2) << "Record store was empty; setting count metadata to zero but marking "
+ "record store as needing size adjustment during recovery. ns: "
+ << ns() << ", ident: " << _uri;
+ sizeRecoveryState(getGlobalServiceContext())
+ .markCollectionAsAlwaysNeedsSizeAdjustment(ns());
_dataSize.store(0);
_numRecords.store(0);
+
// Need to start at 1 so we are always higher than RecordId::min()
_nextIdNum.store(1);
if (_sizeStorer)
@@ -1585,6 +1609,10 @@ private:
};
void WiredTigerRecordStore::_changeNumRecords(OperationContext* opCtx, int64_t diff) {
+ if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) {
+ return;
+ }
+
opCtx->recoveryUnit()->registerChange(new NumRecordsChange(this, diff));
if (_numRecords.fetchAndAdd(diff) < 0)
_numRecords.store(std::max(diff, int64_t(0)));
@@ -1604,6 +1632,10 @@ private:
};
void WiredTigerRecordStore::_increaseDataSize(OperationContext* opCtx, int64_t amount) {
+ if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) {
+ return;
+ }
+
if (opCtx)
opCtx->recoveryUnit()->registerChange(new DataSizeChange(this, amount));
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
index 6ff78eae68a..d0dc5e6cfd2 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
@@ -296,9 +296,27 @@ private:
RecordId _nextId();
void _setId(RecordId id);
bool cappedAndNeedDelete() const;
+ RecordData _getData(const WiredTigerCursor& cursor) const;
+
+ /**
+ * Adjusts the record count and data size metadata for this record store, respectively. These
+ * functions consult the SizeRecoveryState to determine whether or not to actually change the
+ * size metadata if the server is undergoing recovery.
+ *
+ * For most record stores, we will not update the size metadata during recovery, as we trust
+ * that the values in the SizeStorer are accurate with respect to the end state of recovery.
+ * However, there are two exceptions:
+ *
+ * 1. When a record store is created as part of the recovery process. The SizeStorer will have
+ * no information about that newly-created ident.
+ * 2. When a record store is created at startup but contains no records as of the stable
+ * checkpoint timestamp. In this scenario, we will assume that the record store has a size
+ * of zero and will discard all cached size metadata. This assumption is incorrect if there
+ * are pending writes to this ident as part of the recovery process, and so we must
+ * always adjust size metadata for these idents.
+ */
void _changeNumRecords(OperationContext* opCtx, int64_t diff);
void _increaseDataSize(OperationContext* opCtx, int64_t amount);
- RecordData _getData(const WiredTigerCursor& cursor) const;
const std::string _uri;