author    Kyle Suarez <kyle.suarez@mongodb.com>  2018-04-07 10:52:37 -0400
committer Kyle Suarez <kyle.suarez@mongodb.com>  2018-04-07 10:52:37 -0400
commit    fae36f1444627d28bd18e7395962078a729b940a (patch)
tree      f6a9a94ec8e528eef5c4bb2d404a5997e40c2966
parent    47e4b6b791cce13a36ea499a9311d54f100412ee (diff)
download  mongo-fae36f1444627d28bd18e7395962078a729b940a.tar.gz
SERVER-33488 conditionally update WT size metadata during startup recovery
-rw-r--r--  jstests/auth/upgrade_noauth_to_keyfile.js                                         |  6
-rw-r--r--  jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js  | 79
-rw-r--r--  jstests/ssl/initial_sync1_x509.js                                                 |  2
-rw-r--r--  jstests/ssl/ssl_cert_password.js                                                  | 19
-rw-r--r--  jstests/ssl/upgrade_noauth_to_x509_ssl.js                                         |  6
-rw-r--r--  jstests/ssl/upgrade_to_ssl.js                                                     |  6
-rw-r--r--  jstests/ssl/upgrade_to_x509_ssl.js                                                |  6
-rw-r--r--  src/mongo/db/SConscript                                                           |  1
-rw-r--r--  src/mongo/db/repl/replication_recovery.cpp                                        | 11
-rw-r--r--  src/mongo/db/server_recovery.cpp                                                  | 71
-rw-r--r--  src/mongo/db/server_recovery.h                                                    | 90
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp                       | 32
-rw-r--r--  src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h                         | 20
13 files changed, 325 insertions(+), 24 deletions(-)
diff --git a/jstests/auth/upgrade_noauth_to_keyfile.js b/jstests/auth/upgrade_noauth_to_keyfile.js
index 464199ab745..9bf2ec115e6 100644
--- a/jstests/auth/upgrade_noauth_to_keyfile.js
+++ b/jstests/auth/upgrade_noauth_to_keyfile.js
@@ -38,13 +38,13 @@ TestData.skipGossipingClusterTime = true;
rstConn1.getDB('admin').createUser({user: 'root', pwd: 'root', roles: ['root']});
rstConn1.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'});
- assert.eq(1, rstConn1.getDB('test').a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(1, rstConn1.getDB('test').a.count(), 'Error interacting with replSet');
print('=== UPGRADE noauth -> transitionToAuth/keyFile ===');
rst.upgradeSet(transitionToAuthOptions);
var rstConn2 = rst.getPrimary();
rstConn2.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'});
- assert.eq(2, rstConn2.getDB('test').a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(2, rstConn2.getDB('test').a.count(), 'Error interacting with replSet');
print('=== UPGRADE transitionToAuth/keyFile -> keyFile ===');
rst.upgradeSet(keyFileOptions, 'root', 'root');
@@ -52,7 +52,7 @@ TestData.skipGossipingClusterTime = true;
// upgradeSet leaves its connections logged in as root
var rstConn3 = rst.getPrimary();
rstConn3.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'});
- assert.eq(3, rstConn3.getDB('test').a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(3, rstConn3.getDB('test').a.count(), 'Error interacting with replSet');
rst.stopSet();
}());
diff --git a/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js b/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js
new file mode 100644
index 00000000000..45a005e255e
--- /dev/null
+++ b/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js
@@ -0,0 +1,79 @@
+/**
+ * Tests that fast metadata counts are correct after replication recovery following a clean
+ * shutdown.
+ *
+ * @tags: [requires_persistence, requires_replication]
+ */
+(function() {
+ "use strict";
+
+ const rst = new ReplSetTest({
+ name: "recoveryAfterCleanShutdown",
+ nodes: 2,
+ nodeOptions:
+ {setParameter: {logComponentVerbosity: tojsononeline({storage: {recovery: 2}})}}
+ });
+ const nodes = rst.startSet();
+ rst.initiate();
+
+ const dbName = "recovery_clean_shutdown";
+ let primaryDB = rst.getPrimary().getDB(dbName);
+ const wMajority = {writeConcern: {w: "majority", wtimeout: ReplSetTest.kDefaultTimeoutMS}};
+
+ // Create a collection that will have all of its writes in the stable checkpoint.
+ const collAllStableWrites = "allWritesInStableCheckpoint";
+ assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "dan"}, wMajority));
+ assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "judah"}, wMajority));
+ assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "vessy"}, wMajority));
+ assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "kyle"}, wMajority));
+
+ // Set up a collection with some writes that make it into the stable checkpoint.
+ const collSomeStableWrites = "someWritesInStableCheckpoint";
+ assert.commandWorked(primaryDB[collSomeStableWrites].insert({_id: "erjon"}, wMajority));
+ assert.commandWorked(primaryDB[collSomeStableWrites].insert({_id: "jungsoo"}, wMajority));
+
+ // Set up a collection whose creation is in the stable checkpoint, but will have no stable
+ // writes.
+ const collNoStableWrites = "noWritesInStableCheckpoint";
+ assert.commandWorked(primaryDB[collNoStableWrites].runCommand("create", wMajority));
+
+ // Wait for all oplog entries to enter the stable checkpoint on all secondaries.
+ rst.awaitLastOpCommitted();
+
+ // Disable snapshotting on all members of the replica set so that further operations do not
+ // enter the majority snapshot.
+ nodes.forEach(node => assert.commandWorked(node.adminCommand(
+ {configureFailPoint: "disableSnapshotting", mode: "alwaysOn"})));
+ const w1 = {writeConcern: {w: 1, wtimeout: ReplSetTest.kDefaultTimeoutMS}};
+
+ // Set up a collection whose creation is not in the stable checkpoint.
+ const collNoStableCreation = "creationNotInStableCheckpoint";
+ assert.commandWorked(primaryDB[collNoStableCreation].runCommand("create", w1));
+
+ // Perform writes on collections that replicate to each node but do not enter the majority
+ // snapshot. These commands will be replayed during replication recovery during restart.
+ [collSomeStableWrites, collNoStableWrites, collNoStableCreation].forEach(
+ coll => assert.commandWorked(
+ primaryDB[coll].insert({_id: "insertedAfterSnapshottingDisabled"}, w1)));
+ rst.awaitReplication();
+
+ jsTestLog("Checking collection counts after snapshotting has been disabled");
+ rst.checkCollectionCounts();
+
+ // Perform a clean shutdown and restart. Note that the 'disableSnapshotting' failpoint will be
+ // unset on each node following the restart.
+ nodes.forEach(node => rst.restart(node));
+ rst.awaitNodesAgreeOnPrimary();
+ primaryDB = rst.getPrimary().getDB(dbName);
+
+ // Perform a majority write to ensure that both nodes agree on the majority commit point.
+ const collCreatedAfterRestart = "createdAfterRestart";
+ assert.commandWorked(
+ primaryDB[collCreatedAfterRestart].insert({_id: "insertedAfterRestart"}, wMajority));
+
+ // Fast metadata count should be correct after restart in the face of a clean shutdown.
+ jsTestLog("Checking collection counts after clean restart of all nodes");
+ rst.checkCollectionCounts();
+
+ rst.stopSet();
+}());
diff --git a/jstests/ssl/initial_sync1_x509.js b/jstests/ssl/initial_sync1_x509.js
index 9cc2acdb295..c49ab37df5a 100644
--- a/jstests/ssl/initial_sync1_x509.js
+++ b/jstests/ssl/initial_sync1_x509.js
@@ -54,7 +54,7 @@ function runInitialSyncTest() {
bulk.insert({date: new Date(), x: i, str: "all the talk on the market"});
}
assert.writeOK(bulk.execute());
- print("total in foo: " + foo.bar.find().itcount());
+ print("total in foo: " + foo.bar.count());
print("4. Make sure synced");
replTest.awaitReplication();
diff --git a/jstests/ssl/ssl_cert_password.js b/jstests/ssl/ssl_cert_password.js
index 8c555fd822b..356c0603c84 100644
--- a/jstests/ssl/ssl_cert_password.js
+++ b/jstests/ssl/ssl_cert_password.js
@@ -4,6 +4,7 @@
// does not return error statuses to indicate an error.
// This test requires ssl support in mongo-tools
// @tags: [requires_ssl_mongo_tools]
+
load('jstests/ssl/libs/ssl_helpers.js');
requireSSLProvider('openssl', function() {
var baseName = "jstests_ssl_ssl_cert_password";
@@ -41,10 +42,9 @@ requireSSLProvider('openssl', function() {
// Test that mongodump and mongorestore support ssl
c = md.getDB("dumprestore_ssl").getCollection("foo");
- assert.eq(0, c.find().itcount(), "dumprestore_ssl.foo collection is not initially empty");
+ assert.eq(0, c.count(), "dumprestore_ssl.foo collection is not initially empty");
c.save({a: 22});
- assert.eq(
- 1, c.find().itcount(), "failed to insert document into dumprestore_ssl.foo collection");
+ assert.eq(1, c.count(), "failed to insert document into dumprestore_ssl.foo collection");
exit_code = MongoRunner.runMongoTool("mongodump", {
out: external_scratch_dir,
@@ -58,7 +58,7 @@ requireSSLProvider('openssl', function() {
assert.eq(exit_code, 0, "Failed to start mongodump with ssl");
c.drop();
- assert.eq(0, c.find().itcount(), "dumprestore_ssl.foo collection is not empty after drop");
+ assert.eq(0, c.count(), "dumprestore_ssl.foo collection is not empty after drop");
exit_code = MongoRunner.runMongoTool("mongorestore", {
dir: external_scratch_dir,
@@ -75,17 +75,16 @@ requireSSLProvider('openssl', function() {
"no data after sleep. Expected a document after calling mongorestore");
assert.eq(
1,
- c.find().itcount(),
+ c.count(),
"did not find expected document in dumprestore_ssl.foo collection after mongorestore");
assert.eq(22, c.findOne().a, "did not find correct value in document after mongorestore");
// Test that mongoimport and mongoexport support ssl
var exportimport_ssl_dbname = "exportimport_ssl";
c = md.getDB(exportimport_ssl_dbname).getCollection("foo");
- assert.eq(0, c.find().itcount(), "exportimport_ssl.foo collection is not initially empty");
+ assert.eq(0, c.count(), "exportimport_ssl.foo collection is not initially empty");
c.save({a: 22});
- assert.eq(
- 1, c.find().itcount(), "failed to insert document into exportimport_ssl.foo collection");
+ assert.eq(1, c.count(), "failed to insert document into exportimport_ssl.foo collection");
var exportimport_file = "data.json";
@@ -103,7 +102,7 @@ requireSSLProvider('openssl', function() {
assert.eq(exit_code, 0, "Failed to start mongoexport with ssl");
c.drop();
- assert.eq(0, c.find().itcount(), "afterdrop", "-d", exportimport_ssl_dbname, "-c", "foo");
+ assert.eq(0, c.count(), "exportimport_ssl.foo collection is not empty after drop");
exit_code = MongoRunner.runMongoTool("mongoimport", {
file: external_scratch_dir + exportimport_file,
@@ -121,7 +120,7 @@ requireSSLProvider('openssl', function() {
assert.soon("c.findOne()",
"no data after sleep. Expected a document after calling mongoimport");
assert.eq(1,
- c.find().itcount(),
+ c.count(),
"did not find expected document in dumprestore_ssl.foo collection after mongoimport");
assert.eq(22, c.findOne().a, "did not find correct value in document after mongoimport");
diff --git a/jstests/ssl/upgrade_noauth_to_x509_ssl.js b/jstests/ssl/upgrade_noauth_to_x509_ssl.js
index fa4c58f66b1..ba1f53bdb8e 100644
--- a/jstests/ssl/upgrade_noauth_to_x509_ssl.js
+++ b/jstests/ssl/upgrade_noauth_to_x509_ssl.js
@@ -35,7 +35,7 @@ load('jstests/ssl/libs/ssl_helpers.js');
{createUser: 'root', pwd: 'root', roles: ['root'], writeConcern: {w: 3}}));
assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'}));
- assert.eq(1, testDB.a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(1, testDB.a.count(), 'Error interacting with replSet');
print('=== UPGRADE transition to x509/allowSSL -> transition to x509/preferSSL ===');
rst.nodes.forEach(function(node) {
@@ -44,7 +44,7 @@ load('jstests/ssl/libs/ssl_helpers.js');
rst.awaitSecondaryNodes();
testDB = rst.getPrimary().getDB(dbName);
assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'}));
- assert.eq(2, testDB.a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(2, testDB.a.count(), 'Error interacting with replSet');
print('=== UPGRADE transition to x509/preferSSL -> x509/requireSSL ===');
rst.upgradeSet(x509RequireSSL, 'root', 'root');
@@ -52,7 +52,7 @@ load('jstests/ssl/libs/ssl_helpers.js');
// upgradeSet leaves its connections logged in as root
testDB = rst.getPrimary().getDB(dbName);
assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'}));
- assert.eq(3, testDB.a.find().itcount(), 'Error interacting with replSet');
+ assert.eq(3, testDB.a.count(), 'Error interacting with replSet');
rst.stopSet();
}());
diff --git a/jstests/ssl/upgrade_to_ssl.js b/jstests/ssl/upgrade_to_ssl.js
index 48453e27daa..d062e8407d5 100644
--- a/jstests/ssl/upgrade_to_ssl.js
+++ b/jstests/ssl/upgrade_to_ssl.js
@@ -23,14 +23,14 @@ rst.initiate();
var rstConn1 = rst.getPrimary();
rstConn1.getDB("test").a.insert({a: 1, str: "TESTTESTTEST"});
-assert.eq(1, rstConn1.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(1, rstConn1.getDB("test").a.count(), "Error interacting with replSet");
print("===== UPGRADE allowSSL -> preferSSL =====");
opts.sslMode = "preferSSL";
rst.upgradeSet(opts);
var rstConn2 = rst.getPrimary();
rstConn2.getDB("test").a.insert({a: 2, str: "CHECKCHECK"});
-assert.eq(2, rstConn2.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(2, rstConn2.getDB("test").a.count(), "Error interacting with replSet");
// Check that non-ssl connections can still be made
var canConnectNoSSL = runMongoProgram("mongo", "--port", rst.ports[0], "--eval", ";");
@@ -41,7 +41,7 @@ opts.sslMode = "requireSSL";
rst.upgradeSet(opts);
var rstConn3 = rst.getPrimary();
rstConn3.getDB("test").a.insert({a: 3, str: "GREENEGGSANDHAM"});
-assert.eq(3, rstConn3.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(3, rstConn3.getDB("test").a.count(), "Error interacting with replSet");
// Check that ssl connections can be made
var canConnectSSL = runMongoProgram(
diff --git a/jstests/ssl/upgrade_to_x509_ssl.js b/jstests/ssl/upgrade_to_x509_ssl.js
index 631f91dd1bd..51c5418713c 100644
--- a/jstests/ssl/upgrade_to_x509_ssl.js
+++ b/jstests/ssl/upgrade_to_x509_ssl.js
@@ -60,7 +60,7 @@ rstConn1.getDB("admin").createUser({user: "root", pwd: "pwd", roles: ["root"]},
rstConn1.getDB("admin").auth("root", "pwd");
rstConn1.getDB("test").a.insert({a: 1, str: "TESTTESTTEST"});
rstConn1.getDB("test").a.insert({a: 1, str: "WOOPWOOPWOOPWOOPWOOP"});
-assert.eq(2, rstConn1.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(2, rstConn1.getDB("test").a.count(), "Error interacting with replSet");
print("===== UPGRADE allowSSL,sendKeyfile -> preferSSL,sendX509 =====");
authAllNodes();
@@ -79,7 +79,7 @@ rst.upgradeSet({
authAllNodes();
var rstConn3 = rst.getPrimary();
rstConn3.getDB("test").a.insert({a: 3, str: "TESTTESTTEST"});
-assert.eq(3, rstConn3.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(3, rstConn3.getDB("test").a.count(), "Error interacting with replSet");
rst.awaitReplication();
// Test that a non-ssl connection can still be made
var canConnectNoSSL = runMongoProgram("mongo", "--port", rst.ports[0], "--eval", ";");
@@ -99,5 +99,5 @@ rst.upgradeSet({
authAllNodes();
var rstConn4 = rst.getPrimary();
rstConn4.getDB("test").a.insert({a: 4, str: "TESTTESTTEST"});
-assert.eq(4, rstConn4.getDB("test").a.find().itcount(), "Error interacting with replSet");
+assert.eq(4, rstConn4.getDB("test").a.count(), "Error interacting with replSet");
rst.stopSet();
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 17add6e4bf7..74c0191286c 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -482,6 +482,7 @@ env.Library(
'service_context.cpp',
'service_context_noop.cpp',
'service_context_registrar.cpp',
+ 'server_recovery.cpp',
'unclean_shutdown.cpp',
],
LIBDEPS=[
diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp
index 16adfd39966..3ddf34a9e21 100644
--- a/src/mongo/db/repl/replication_recovery.cpp
+++ b/src/mongo/db/repl/replication_recovery.cpp
@@ -38,6 +38,7 @@
#include "mongo/db/repl/replication_consistency_markers_impl.h"
#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/repl/sync_tail.h"
+#include "mongo/db/server_recovery.h"
#include "mongo/db/session.h"
#include "mongo/util/log.h"
@@ -55,6 +56,16 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
return; // Initial Sync will take over so no cleanup is needed.
}
+ const auto serviceCtx = getGlobalServiceContext();
+ inReplicationRecovery(serviceCtx) = true;
+ ON_BLOCK_EXIT([serviceCtx] {
+ invariant(
+ inReplicationRecovery(serviceCtx),
+ "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()");
+ inReplicationRecovery(serviceCtx) = false;
+ sizeRecoveryState(serviceCtx).clearStateAfterRecovery();
+ });
+
const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
if (!truncateAfterPoint.isNull()) {
log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
diff --git a/src/mongo/db/server_recovery.cpp b/src/mongo/db/server_recovery.cpp
new file mode 100644
index 00000000000..4f4f46f0293
--- /dev/null
+++ b/src/mongo/db/server_recovery.cpp
@@ -0,0 +1,71 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#include "mongo/platform/basic.h"
+
+#include "mongo/db/server_recovery.h"
+
+#include "mongo/db/namespace_string.h"
+
+namespace mongo {
+namespace {
+const auto getInReplicationRecovery = ServiceContext::declareDecoration<bool>();
+const auto getSizeRecoveryState = ServiceContext::declareDecoration<SizeRecoveryState>();
+} // namespace
+
+bool SizeRecoveryState::collectionNeedsSizeAdjustment(const std::string& ns) const {
+ if (!inReplicationRecovery(getGlobalServiceContext())) {
+ return true;
+ }
+
+ if (NamespaceString::oplog(ns)) {
+ return true;
+ }
+
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ return _collectionsAlwaysNeedingSizeAdjustment.count(ns) > 0;
+}
+
+void SizeRecoveryState::markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns) {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ _collectionsAlwaysNeedingSizeAdjustment.insert(ns);
+}
+
+void SizeRecoveryState::clearStateAfterRecovery() {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ _collectionsAlwaysNeedingSizeAdjustment.clear();
+}
+} // namespace mongo
+
+bool& mongo::inReplicationRecovery(ServiceContext* serviceCtx) {
+ return getInReplicationRecovery(serviceCtx);
+}
+
+mongo::SizeRecoveryState& mongo::sizeRecoveryState(ServiceContext* serviceCtx) {
+ return getSizeRecoveryState(serviceCtx);
+}
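
For readers unfamiliar with the pattern in the anonymous namespace above: ServiceContext::declareDecoration<T>() returns a handle that, when invoked with a ServiceContext*, yields a mutable reference to a per-context instance of T, default-constructed alongside the context. A minimal sketch assuming only that behavior (the counter decoration and both function names here are hypothetical, not part of this patch):

    #include "mongo/db/service_context.h"

    namespace {
    // One int slot per ServiceContext, value-initialized to zero.
    const auto getExampleCounter = mongo::ServiceContext::declareDecoration<int>();
    }  // namespace

    int bumpExampleCounter(mongo::ServiceContext* serviceCtx) {
        // Distinct ServiceContext instances carry independent counters, which is
        // why inReplicationRecovery() and sizeRecoveryState() take an explicit
        // context instead of relying on a mutable global.
        return ++getExampleCounter(serviceCtx);
    }
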
diff --git a/src/mongo/db/server_recovery.h b/src/mongo/db/server_recovery.h
new file mode 100644
index 00000000000..0fcd6ebc55d
--- /dev/null
+++ b/src/mongo/db/server_recovery.h
@@ -0,0 +1,90 @@
+/**
+ * Copyright (C) 2018 MongoDB Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License, version 3,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the GNU Affero General Public License in all respects
+ * for all of the code used other than as permitted herein. If you modify
+ * file(s) with this exception, you may extend this exception to your
+ * version of the file(s), but you are not obligated to do so. If you do not
+ * wish to do so, delete this exception statement from your version. If you
+ * delete this exception statement from all source files in the program,
+ * then also delete it in the license file.
+ */
+
+#pragma once
+
+#include <set>
+#include <string>
+
+#include "mongo/db/service_context.h"
+#include "mongo/stdx/mutex.h"
+
+namespace mongo {
+/**
+ * This class is for use with non-MMAPv1 storage engines that track record store sizes in catalog
+ * metadata.
+ *
+ * During normal server operation, we adjust the size metadata for all record stores. But when
+ * performing replication recovery, we avoid doing so, as we trust that the size metadata on disk is
+ * already correct with respect to the end state of recovery.
+ *
+ * However, there may be exceptions that require the server to adjust size metadata even during
+ * recovery. One such case is the oplog: during rollback, the oplog is truncated, and then recovery
+ * occurs using oplog entries after the common point from the sync source. The server will need to
+ * adjust the size metadata for the oplog namespace to ensure that the count of oplog entries is
+ * correct after rollback recovery.
+ *
+ * This class is responsible for keeping track of namespaces that require this special
+ * count adjustment.
+ */
+class SizeRecoveryState {
+public:
+ /**
+ * If replication recovery is ongoing, returns false unless 'ns' is the oplog namespace or has
+ * been specifically marked as requiring adjustment even during recovery.
+ *
+ * If the system is not currently undergoing replication recovery, always returns true.
+ */
+ bool collectionNeedsSizeAdjustment(const std::string& ns) const;
+
+ /**
+ * Mark 'ns' as always requiring size adjustment, even if replication recovery is ongoing.
+ */
+ void markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns);
+
+ /**
+ * Clears all internal state. This method should be called when replication recovery ends.
+ */
+ void clearStateAfterRecovery();
+
+private:
+ mutable stdx::mutex _mutex;
+ std::set<std::string> _collectionsAlwaysNeedingSizeAdjustment;
+};
+
+/**
+ * Returns a mutable reference to the single SizeRecoveryState associated with 'serviceCtx'.
+ */
+SizeRecoveryState& sizeRecoveryState(ServiceContext* serviceCtx);
+
+/**
+ * Returns a mutable reference to a boolean decoration on 'serviceCtx', which indicates whether or
+ * not the server is currently undergoing replication recovery.
+ */
+bool& inReplicationRecovery(ServiceContext* serviceCtx);
+} // namespace mongo
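
Putting this header together with the hook in replication_recovery.cpp above, the intended lifecycle is roughly the following sketch (exampleRecoveryLifecycle and the namespace strings are hypothetical; the real sequencing lives in ReplicationRecoveryImpl::recoverFromOplog and the WiredTiger record store):

    #include "mongo/db/server_recovery.h"

    namespace mongo {
    void exampleRecoveryLifecycle(ServiceContext* serviceCtx) {
        // recoverFromOplog() sets this flag for the duration of recovery.
        inReplicationRecovery(serviceCtx) = true;

        // A record store created (or found empty) during recovery opts in to
        // size adjustment; the oplog namespace is always adjusted.
        sizeRecoveryState(serviceCtx).markCollectionAsAlwaysNeedsSizeAdjustment(
            "test.createdDuringRecovery");

        // true: explicitly marked above.
        bool marked = sizeRecoveryState(serviceCtx).collectionNeedsSizeAdjustment(
            "test.createdDuringRecovery");
        // false while the recovery flag is set: the on-disk metadata is trusted.
        bool ordinary = sizeRecoveryState(serviceCtx).collectionNeedsSizeAdjustment(
            "test.ordinary");
        (void)marked;
        (void)ordinary;

        // The ON_BLOCK_EXIT in recoverFromOplog() performs this cleanup.
        inReplicationRecovery(serviceCtx) = false;
        sizeRecoveryState(serviceCtx).clearStateAfterRecovery();
    }
    }  // namespace mongo
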
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 6a1be102524..21b14a20734 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -30,6 +30,8 @@
*/
#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage
+#define LOG_FOR_RECOVERY(level) \
+ MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kStorageRecovery)
#include "mongo/platform/basic.h"
@@ -45,6 +47,7 @@
#include "mongo/db/namespace_string.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/repl/repl_settings.h"
+#include "mongo/db/server_recovery.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/oplog_hack.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h"
@@ -654,6 +657,17 @@ WiredTigerRecordStore::WiredTigerRecordStore(WiredTigerKVEngine* kvEngine,
if (_isOplog) {
checkOplogFormatVersion(ctx, _uri);
}
+
+ // Most record stores will not have their size metadata adjusted during replication recovery.
+ // However, if this record store was created during the recovery process, we will need to keep
+ // track of size adjustments for any writes applied to it during recovery.
+ const auto serviceCtx = getGlobalServiceContext();
+ if (inReplicationRecovery(serviceCtx)) {
+ LOG_FOR_RECOVERY(2)
+ << "Marking newly-created record store as needing size adjustment during recovery. ns: "
+ << ns() << ", ident: " << _uri;
+ sizeRecoveryState(serviceCtx).markCollectionAsAlwaysNeedsSizeAdjustment(ns());
+ }
}
WiredTigerRecordStore::~WiredTigerRecordStore() {
@@ -703,8 +717,18 @@ void WiredTigerRecordStore::postConstructorInit(OperationContext* opCtx) {
} while ((record = cursor->next()));
}
} else {
+ // We found no records in this collection; however, there may actually be documents present
+ // if writes to this collection were not included in the stable checkpoint the last time
+ // this node shut down. We set the data size and the record count to zero, but will adjust
+ // these if writes are played during startup recovery.
+ LOG_FOR_RECOVERY(2) << "Record store was empty; setting count metadata to zero but marking "
+ "record store as needing size adjustment during recovery. ns: "
+ << ns() << ", ident: " << _uri;
+ sizeRecoveryState(getGlobalServiceContext())
+ .markCollectionAsAlwaysNeedsSizeAdjustment(ns());
_dataSize.store(0);
_numRecords.store(0);
+
// Need to start at 1 so we are always higher than RecordId::min()
_nextIdNum.store(1);
if (_sizeStorer)
@@ -1585,6 +1609,10 @@ private:
};
void WiredTigerRecordStore::_changeNumRecords(OperationContext* opCtx, int64_t diff) {
+ if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) {
+ return;
+ }
+
opCtx->recoveryUnit()->registerChange(new NumRecordsChange(this, diff));
if (_numRecords.fetchAndAdd(diff) < 0)
_numRecords.store(std::max(diff, int64_t(0)));
@@ -1604,6 +1632,10 @@ private:
};
void WiredTigerRecordStore::_increaseDataSize(OperationContext* opCtx, int64_t amount) {
+ if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) {
+ return;
+ }
+
if (opCtx)
opCtx->recoveryUnit()->registerChange(new DataSizeChange(this, amount));
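
The early returns added to _changeNumRecords() and _increaseDataSize() above share one shape; condensed as a free function, it reads roughly as follows (a sketch only: it drops the rollback Change that the real methods register with the RecoveryUnit, and assumes AtomicInt64 matches the type of the _numRecords member):

    #include <algorithm>
    #include <cstdint>
    #include <string>

    #include "mongo/db/server_recovery.h"
    #include "mongo/platform/atomic_word.h"

    namespace mongo {
    void changeNumRecordsSketch(AtomicInt64& numRecords, const std::string& ns, int64_t diff) {
        // During replication recovery, unmarked collections trust the counts the
        // SizeStorer persisted at the last clean shutdown and skip the update.
        if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns)) {
            return;
        }
        // Mirror the defensive clamp in the real method: recover from a
        // negative running count instead of propagating it.
        if (numRecords.fetchAndAdd(diff) < 0) {
            numRecords.store(std::max(diff, int64_t(0)));
        }
    }
    }  // namespace mongo
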
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
index 6ff78eae68a..d0dc5e6cfd2 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h
@@ -296,9 +296,27 @@ private:
RecordId _nextId();
void _setId(RecordId id);
bool cappedAndNeedDelete() const;
+ RecordData _getData(const WiredTigerCursor& cursor) const;
+
+ /**
+ * The following two functions adjust the record count and data size metadata for this record
+ * store, respectively. They consult the SizeRecoveryState to determine whether to actually
+ * change the size metadata while the server is undergoing recovery.
+ *
+ * For most record stores, we will not update the size metadata during recovery, as we trust
+ * that the values in the SizeStorer are accurate with respect to the end state of recovery.
+ * However, there are two exceptions:
+ *
+ * 1. When a record store is created as part of the recovery process. The SizeStorer will have
+ * no information about that newly-created ident.
+ * 2. When a record store is created at startup but contains no records as of the stable
+ * checkpoint timestamp. In this scenario, we will assume that the record store has a size
+ * of zero and will discard all cached size metadata. This assumption is incorrect if there
+ * are pending writes to this ident as part of the recovery process, and so we must
+ * always adjust size metadata for these idents.
+ */
void _changeNumRecords(OperationContext* opCtx, int64_t diff);
void _increaseDataSize(OperationContext* opCtx, int64_t amount);
- RecordData _getData(const WiredTigerCursor& cursor) const;
const std::string _uri;