diff options
author | Kyle Suarez <kyle.suarez@mongodb.com> | 2018-04-07 10:52:37 -0400 |
---|---|---|
committer | Kyle Suarez <kyle.suarez@mongodb.com> | 2018-04-07 10:52:37 -0400 |
commit | fae36f1444627d28bd18e7395962078a729b940a (patch) | |
tree | f6a9a94ec8e528eef5c4bb2d404a5997e40c2966 | |
parent | 47e4b6b791cce13a36ea499a9311d54f100412ee (diff) | |
download | mongo-fae36f1444627d28bd18e7395962078a729b940a.tar.gz |
SERVER-33488 conditionally update WT size metadata during startup recovery
-rw-r--r-- | jstests/auth/upgrade_noauth_to_keyfile.js | 6 | ||||
-rw-r--r-- | jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js | 79 | ||||
-rw-r--r-- | jstests/ssl/initial_sync1_x509.js | 2 | ||||
-rw-r--r-- | jstests/ssl/ssl_cert_password.js | 19 | ||||
-rw-r--r-- | jstests/ssl/upgrade_noauth_to_x509_ssl.js | 6 | ||||
-rw-r--r-- | jstests/ssl/upgrade_to_ssl.js | 6 | ||||
-rw-r--r-- | jstests/ssl/upgrade_to_x509_ssl.js | 6 | ||||
-rw-r--r-- | src/mongo/db/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/replication_recovery.cpp | 11 | ||||
-rw-r--r-- | src/mongo/db/server_recovery.cpp | 71 | ||||
-rw-r--r-- | src/mongo/db/server_recovery.h | 90 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 32 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h | 20 |
13 files changed, 325 insertions, 24 deletions
diff --git a/jstests/auth/upgrade_noauth_to_keyfile.js b/jstests/auth/upgrade_noauth_to_keyfile.js index 464199ab745..9bf2ec115e6 100644 --- a/jstests/auth/upgrade_noauth_to_keyfile.js +++ b/jstests/auth/upgrade_noauth_to_keyfile.js @@ -38,13 +38,13 @@ TestData.skipGossipingClusterTime = true; rstConn1.getDB('admin').createUser({user: 'root', pwd: 'root', roles: ['root']}); rstConn1.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'}); - assert.eq(1, rstConn1.getDB('test').a.find().itcount(), 'Error interacting with replSet'); + assert.eq(1, rstConn1.getDB('test').a.count(), 'Error interacting with replSet'); print('=== UPGRADE noauth -> transitionToAuth/keyFile ==='); rst.upgradeSet(transitionToAuthOptions); var rstConn2 = rst.getPrimary(); rstConn2.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'}); - assert.eq(2, rstConn2.getDB('test').a.find().itcount(), 'Error interacting with replSet'); + assert.eq(2, rstConn2.getDB('test').a.count(), 'Error interacting with replSet'); print('=== UPGRADE transitionToAuth/keyFile -> keyFile ==='); rst.upgradeSet(keyFileOptions, 'root', 'root'); @@ -52,7 +52,7 @@ TestData.skipGossipingClusterTime = true; // upgradeSet leaves its connections logged in as root var rstConn3 = rst.getPrimary(); rstConn3.getDB('test').a.insert({a: 1, str: 'TESTTESTTEST'}); - assert.eq(3, rstConn3.getDB('test').a.find().itcount(), 'Error interacting with replSet'); + assert.eq(3, rstConn3.getDB('test').a.count(), 'Error interacting with replSet'); rst.stopSet(); }()); diff --git a/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js b/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js new file mode 100644 index 00000000000..45a005e255e --- /dev/null +++ b/jstests/replsets/recovery_after_clean_shutdown_but_not_all_writes_in_snapshot.js @@ -0,0 +1,79 @@ +/** + * Tests that fast metadata counts are correct after replication recovery following a clean + * shutdown. + * + * @tags: [requires_persistence, requires_replication] + */ +(function() { + "use strict"; + + const rst = new ReplSetTest({ + name: "recoveryAfterCleanShutdown", + nodes: 2, + nodeOptions: + {setParameter: {logComponentVerbosity: tojsononeline({storage: {recovery: 2}})}} + }); + const nodes = rst.startSet(); + rst.initiate(); + + const dbName = "recovery_clean_shutdown"; + let primaryDB = rst.getPrimary().getDB(dbName); + const wMajority = {writeConcern: {w: "majority", wtimeout: ReplSetTest.kDefaultTimeoutMS}}; + + // Create a collection that will have all of its writes in the stable checkpoint. + const collAllStableWrites = "allWritesInStableCheckpoint"; + assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "dan"}, wMajority)); + assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "judah"}, wMajority)); + assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "vessy"}, wMajority)); + assert.commandWorked(primaryDB[collAllStableWrites].insert({_id: "kyle"}, wMajority)); + + // Set up a collection with some writes that make it into the stable checkpoint. + const collSomeStableWrites = "someWritesInStableCheckpoint"; + assert.commandWorked(primaryDB[collSomeStableWrites].insert({_id: "erjon"}, wMajority)); + assert.commandWorked(primaryDB[collSomeStableWrites].insert({_id: "jungsoo"}, wMajority)); + + // Set up a collection whose creation is in the stable checkpoint, but will have no stable + // writes. + const collNoStableWrites = "noWritesInStableCheckpoint"; + assert.commandWorked(primaryDB[collNoStableWrites].runCommand("create", wMajority)); + + // Wait for all oplog entries to enter the stable checkpoint on all secondaries. + rst.awaitLastOpCommitted(); + + // Disable snapshotting on all members of the replica set so that further operations do not + // enter the majority snapshot. + nodes.forEach(node => assert.commandWorked(node.adminCommand( + {configureFailPoint: "disableSnapshotting", mode: "alwaysOn"}))); + const w1 = {writeConcern: {w: 1, wtimeout: ReplSetTest.kDefaultTimeoutMS}}; + + // Set up a collection whose creation is not in the stable checkpoint. + const collNoStableCreation = "creationNotInStableCheckpoint"; + assert.commandWorked(primaryDB[collNoStableCreation].runCommand("create", w1)); + + // Perform writes on collections that replicate to each node but do not enter the majority + // snapshot. These commands will be replayed during replication recovery during restart. + [collSomeStableWrites, collNoStableWrites, collNoStableCreation].forEach( + coll => assert.commandWorked( + primaryDB[coll].insert({_id: "insertedAfterSnapshottingDisabled"}, w1))); + rst.awaitReplication(); + + jsTestLog("Checking collection counts after snapshotting has been disabled"); + rst.checkCollectionCounts(); + + // Perform a clean shutdown and restart. Note that the 'disableSnapshotting' failpoint will be + // unset on each node following the restart. + nodes.forEach(node => rst.restart(node)); + rst.awaitNodesAgreeOnPrimary(); + primaryDB = rst.getPrimary().getDB(dbName); + + // Perform a majority write to ensure that both nodes agree on the majority commit point. + const collCreatedAfterRestart = "createdAfterRestart"; + assert.commandWorked( + primaryDB[collCreatedAfterRestart].insert({_id: "insertedAfterRestart", wMajority})); + + // Fast metadata count should be correct after restart in the face of a clean shutdown. + jsTestLog("Checking collection counts after clean restart of all nodes"); + rst.checkCollectionCounts(); + + rst.stopSet(); +}()); diff --git a/jstests/ssl/initial_sync1_x509.js b/jstests/ssl/initial_sync1_x509.js index 9cc2acdb295..c49ab37df5a 100644 --- a/jstests/ssl/initial_sync1_x509.js +++ b/jstests/ssl/initial_sync1_x509.js @@ -54,7 +54,7 @@ function runInitialSyncTest() { bulk.insert({date: new Date(), x: i, str: "all the talk on the market"}); } assert.writeOK(bulk.execute()); - print("total in foo: " + foo.bar.find().itcount()); + print("total in foo: " + foo.bar.count()); print("4. Make sure synced"); replTest.awaitReplication(); diff --git a/jstests/ssl/ssl_cert_password.js b/jstests/ssl/ssl_cert_password.js index 8c555fd822b..356c0603c84 100644 --- a/jstests/ssl/ssl_cert_password.js +++ b/jstests/ssl/ssl_cert_password.js @@ -4,6 +4,7 @@ // does not return error statuses to indicate an error. // This test requires ssl support in mongo-tools // @tags: [requires_ssl_mongo_tools] + load('jstests/ssl/libs/ssl_helpers.js'); requireSSLProvider('openssl', function() { var baseName = "jstests_ssl_ssl_cert_password"; @@ -41,10 +42,9 @@ requireSSLProvider('openssl', function() { // Test that mongodump and mongorestore support ssl c = md.getDB("dumprestore_ssl").getCollection("foo"); - assert.eq(0, c.find().itcount(), "dumprestore_ssl.foo collection is not initially empty"); + assert.eq(0, c.count(), "dumprestore_ssl.foo collection is not initially empty"); c.save({a: 22}); - assert.eq( - 1, c.find().itcount(), "failed to insert document into dumprestore_ssl.foo collection"); + assert.eq(1, c.count(), "failed to insert document into dumprestore_ssl.foo collection"); exit_code = MongoRunner.runMongoTool("mongodump", { out: external_scratch_dir, @@ -58,7 +58,7 @@ requireSSLProvider('openssl', function() { assert.eq(exit_code, 0, "Failed to start mongodump with ssl"); c.drop(); - assert.eq(0, c.find().itcount(), "dumprestore_ssl.foo collection is not empty after drop"); + assert.eq(0, c.count(), "dumprestore_ssl.foo collection is not empty after drop"); exit_code = MongoRunner.runMongoTool("mongorestore", { dir: external_scratch_dir, @@ -75,17 +75,16 @@ requireSSLProvider('openssl', function() { "no data after sleep. Expected a document after calling mongorestore"); assert.eq( 1, - c.find().itcount(), + c.count(), "did not find expected document in dumprestore_ssl.foo collection after mongorestore"); assert.eq(22, c.findOne().a, "did not find correct value in document after mongorestore"); // Test that mongoimport and mongoexport support ssl var exportimport_ssl_dbname = "exportimport_ssl"; c = md.getDB(exportimport_ssl_dbname).getCollection("foo"); - assert.eq(0, c.find().itcount(), "exportimport_ssl.foo collection is not initially empty"); + assert.eq(0, c.count(), "exportimport_ssl.foo collection is not initially empty"); c.save({a: 22}); - assert.eq( - 1, c.find().itcount(), "failed to insert document into exportimport_ssl.foo collection"); + assert.eq(1, c.count(), "failed to insert document into exportimport_ssl.foo collection"); var exportimport_file = "data.json"; @@ -103,7 +102,7 @@ requireSSLProvider('openssl', function() { assert.eq(exit_code, 0, "Failed to start mongoexport with ssl"); c.drop(); - assert.eq(0, c.find().itcount(), "afterdrop", "-d", exportimport_ssl_dbname, "-c", "foo"); + assert.eq(0, c.count(), "afterdrop", "-d", exportimport_ssl_dbname, "-c", "foo"); exit_code = MongoRunner.runMongoTool("mongoimport", { file: external_scratch_dir + exportimport_file, @@ -121,7 +120,7 @@ requireSSLProvider('openssl', function() { assert.soon("c.findOne()", "no data after sleep. Expected a document after calling mongoimport"); assert.eq(1, - c.find().itcount(), + c.count(), "did not find expected document in dumprestore_ssl.foo collection after mongoimport"); assert.eq(22, c.findOne().a, "did not find correct value in document after mongoimport"); diff --git a/jstests/ssl/upgrade_noauth_to_x509_ssl.js b/jstests/ssl/upgrade_noauth_to_x509_ssl.js index fa4c58f66b1..ba1f53bdb8e 100644 --- a/jstests/ssl/upgrade_noauth_to_x509_ssl.js +++ b/jstests/ssl/upgrade_noauth_to_x509_ssl.js @@ -35,7 +35,7 @@ load('jstests/ssl/libs/ssl_helpers.js'); {createUser: 'root', pwd: 'root', roles: ['root'], writeConcern: {w: 3}})); assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'})); - assert.eq(1, testDB.a.find().itcount(), 'Error interacting with replSet'); + assert.eq(1, testDB.a.count(), 'Error interacting with replSet'); print('=== UPGRADE transition to x509/allowSSL -> transition to x509/preferSSL ==='); rst.nodes.forEach(function(node) { @@ -44,7 +44,7 @@ load('jstests/ssl/libs/ssl_helpers.js'); rst.awaitSecondaryNodes(); testDB = rst.getPrimary().getDB(dbName); assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'})); - assert.eq(2, testDB.a.find().itcount(), 'Error interacting with replSet'); + assert.eq(2, testDB.a.count(), 'Error interacting with replSet'); print('=== UPGRADE transition to x509/preferSSL -> x509/requireSSL ==='); rst.upgradeSet(x509RequireSSL, 'root', 'root'); @@ -52,7 +52,7 @@ load('jstests/ssl/libs/ssl_helpers.js'); // upgradeSet leaves its connections logged in as root testDB = rst.getPrimary().getDB(dbName); assert.writeOK(testDB.a.insert({a: 1, str: 'TESTTESTTEST'})); - assert.eq(3, testDB.a.find().itcount(), 'Error interacting with replSet'); + assert.eq(3, testDB.a.count(), 'Error interacting with replSet'); rst.stopSet(); }()); diff --git a/jstests/ssl/upgrade_to_ssl.js b/jstests/ssl/upgrade_to_ssl.js index 48453e27daa..d062e8407d5 100644 --- a/jstests/ssl/upgrade_to_ssl.js +++ b/jstests/ssl/upgrade_to_ssl.js @@ -23,14 +23,14 @@ rst.initiate(); var rstConn1 = rst.getPrimary(); rstConn1.getDB("test").a.insert({a: 1, str: "TESTTESTTEST"}); -assert.eq(1, rstConn1.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(1, rstConn1.getDB("test").a.count(), "Error interacting with replSet"); print("===== UPGRADE allowSSL -> preferSSL ====="); opts.sslMode = "preferSSL"; rst.upgradeSet(opts); var rstConn2 = rst.getPrimary(); rstConn2.getDB("test").a.insert({a: 2, str: "CHECKCHECK"}); -assert.eq(2, rstConn2.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(2, rstConn2.getDB("test").a.count(), "Error interacting with replSet"); // Check that non-ssl connections can still be made var canConnectNoSSL = runMongoProgram("mongo", "--port", rst.ports[0], "--eval", ";"); @@ -41,7 +41,7 @@ opts.sslMode = "requireSSL"; rst.upgradeSet(opts); var rstConn3 = rst.getPrimary(); rstConn3.getDB("test").a.insert({a: 3, str: "GREENEGGSANDHAM"}); -assert.eq(3, rstConn3.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(3, rstConn3.getDB("test").a.count(), "Error interacting with replSet"); // Check that ssl connections can be made var canConnectSSL = runMongoProgram( diff --git a/jstests/ssl/upgrade_to_x509_ssl.js b/jstests/ssl/upgrade_to_x509_ssl.js index 631f91dd1bd..51c5418713c 100644 --- a/jstests/ssl/upgrade_to_x509_ssl.js +++ b/jstests/ssl/upgrade_to_x509_ssl.js @@ -60,7 +60,7 @@ rstConn1.getDB("admin").createUser({user: "root", pwd: "pwd", roles: ["root"]}, rstConn1.getDB("admin").auth("root", "pwd"); rstConn1.getDB("test").a.insert({a: 1, str: "TESTTESTTEST"}); rstConn1.getDB("test").a.insert({a: 1, str: "WOOPWOOPWOOPWOOPWOOP"}); -assert.eq(2, rstConn1.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(2, rstConn1.getDB("test").a.count(), "Error interacting with replSet"); print("===== UPGRADE allowSSL,sendKeyfile -> preferSSL,sendX509 ====="); authAllNodes(); @@ -79,7 +79,7 @@ rst.upgradeSet({ authAllNodes(); var rstConn3 = rst.getPrimary(); rstConn3.getDB("test").a.insert({a: 3, str: "TESTTESTTEST"}); -assert.eq(3, rstConn3.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(3, rstConn3.getDB("test").a.count(), "Error interacting with replSet"); rst.awaitReplication(); // Test that a non-ssl connection can still be made var canConnectNoSSL = runMongoProgram("mongo", "--port", rst.ports[0], "--eval", ";"); @@ -99,5 +99,5 @@ rst.upgradeSet({ authAllNodes(); var rstConn4 = rst.getPrimary(); rstConn4.getDB("test").a.insert({a: 4, str: "TESTTESTTEST"}); -assert.eq(4, rstConn4.getDB("test").a.find().itcount(), "Error interacting with replSet"); +assert.eq(4, rstConn4.getDB("test").a.count(), "Error interacting with replSet"); rst.stopSet(); diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 17add6e4bf7..74c0191286c 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -482,6 +482,7 @@ env.Library( 'service_context.cpp', 'service_context_noop.cpp', 'service_context_registrar.cpp', + 'server_recovery.cpp', 'unclean_shutdown.cpp', ], LIBDEPS=[ diff --git a/src/mongo/db/repl/replication_recovery.cpp b/src/mongo/db/repl/replication_recovery.cpp index 16adfd39966..3ddf34a9e21 100644 --- a/src/mongo/db/repl/replication_recovery.cpp +++ b/src/mongo/db/repl/replication_recovery.cpp @@ -38,6 +38,7 @@ #include "mongo/db/repl/replication_consistency_markers_impl.h" #include "mongo/db/repl/storage_interface.h" #include "mongo/db/repl/sync_tail.h" +#include "mongo/db/server_recovery.h" #include "mongo/db/session.h" #include "mongo/util/log.h" @@ -55,6 +56,16 @@ void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx, return; // Initial Sync will take over so no cleanup is needed. } + const auto serviceCtx = getGlobalServiceContext(); + inReplicationRecovery(serviceCtx) = true; + ON_BLOCK_EXIT([serviceCtx] { + invariant( + inReplicationRecovery(serviceCtx), + "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()"); + inReplicationRecovery(serviceCtx) = false; + sizeRecoveryState(serviceCtx).clearStateAfterRecovery(); + }); + const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx); if (!truncateAfterPoint.isNull()) { log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON(); diff --git a/src/mongo/db/server_recovery.cpp b/src/mongo/db/server_recovery.cpp new file mode 100644 index 00000000000..4f4f46f0293 --- /dev/null +++ b/src/mongo/db/server_recovery.cpp @@ -0,0 +1,71 @@ +/** + * Copyright (C) 2018 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#include "mongo/platform/basic.h" + +#include "mongo/db/server_recovery.h" + +#include "mongo/db/namespace_string.h" + +namespace mongo { +namespace { +const auto getInReplicationRecovery = ServiceContext::declareDecoration<bool>(); +const auto getSizeRecoveryState = ServiceContext::declareDecoration<SizeRecoveryState>(); +} // namespace + +bool SizeRecoveryState::collectionNeedsSizeAdjustment(const std::string& ns) const { + if (!inReplicationRecovery(getGlobalServiceContext())) { + return true; + } + + if (NamespaceString::oplog(ns)) { + return true; + } + + stdx::lock_guard<stdx::mutex> lock(_mutex); + return _collectionsAlwaysNeedingSizeAdjustment.count(ns) > 0; +} + +void SizeRecoveryState::markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns) { + stdx::lock_guard<stdx::mutex> lock(_mutex); + _collectionsAlwaysNeedingSizeAdjustment.insert(ns); +} + +void SizeRecoveryState::clearStateAfterRecovery() { + stdx::lock_guard<stdx::mutex> lock(_mutex); + _collectionsAlwaysNeedingSizeAdjustment.clear(); +} +} // namespace mongo + +bool& mongo::inReplicationRecovery(ServiceContext* serviceCtx) { + return getInReplicationRecovery(serviceCtx); +} + +mongo::SizeRecoveryState& mongo::sizeRecoveryState(ServiceContext* serviceCtx) { + return getSizeRecoveryState(serviceCtx); +} diff --git a/src/mongo/db/server_recovery.h b/src/mongo/db/server_recovery.h new file mode 100644 index 00000000000..0fcd6ebc55d --- /dev/null +++ b/src/mongo/db/server_recovery.h @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2018 MongoDB Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License, version 3, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the GNU Affero General Public License in all respects + * for all of the code used other than as permitted herein. If you modify + * file(s) with this exception, you may extend this exception to your + * version of the file(s), but you are not obligated to do so. If you do not + * wish to do so, delete this exception statement from your version. If you + * delete this exception statement from all source files in the program, + * then also delete it in the license file. + */ + +#pragma once + +#include <set> +#include <string> + +#include "mongo/db/service_context.h" +#include "mongo/stdx/mutex.h" + +namespace mongo { +/** + * This class is for use with non-MMAPv1 storage engines that track record store sizes in catalog + * metadata. + * + * During normal server operation, we adjust the size metadata for all record stores. But when + * performing replication recovery, we avoid doing so, as we trust that the size metadata on disk is + * already correct with respect to the end state of recovery. + * + * However, there may be exceptions that require the server to adjust size metadata even during + * recovery. One such case is the oplog: during rollback, the oplog is truncated, and then recovery + * occurs using oplog entries after the common point from the sync source. The server will need to + * adjust the size metadata for the oplog namespace to ensure that the count of oplog entries is + * correct after rollback recovery. + * + * This class is responsible for keeping track of namespaces that require this special + * count adjustment. + */ +class SizeRecoveryState { +public: + /** + * If replication recovery is ongoing, returns false unless 'ns' is the oplog namespace or has + * been specifically marked as requiring adjustment even during recovery. + * + * If the system is not currently undergoing replication recovery, always returns true. + */ + bool collectionNeedsSizeAdjustment(const std::string& ns) const; + + /** + * Mark 'ns' as always requiring size adjustment, even if replication recovery is ongoing. + */ + void markCollectionAsAlwaysNeedsSizeAdjustment(const std::string& ns); + + /** + * Clears all internal state. This method should be called when replication recovery ends. + */ + void clearStateAfterRecovery(); + +private: + mutable stdx::mutex _mutex; + std::set<std::string> _collectionsAlwaysNeedingSizeAdjustment; +}; + +/** + * Returns a mutable reference to the single SizeRecoveryState associated with 'serviceCtx'. + */ +SizeRecoveryState& sizeRecoveryState(ServiceContext* serviceCtx); + +/** + * Returns a mutable reference to a boolean decoration on 'serviceCtx', which indicates whether or + * not the server is currently undergoing replication recovery. + */ +bool& inReplicationRecovery(ServiceContext* serviceCtx); +} // namespace mongo diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 6a1be102524..21b14a20734 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -30,6 +30,8 @@ */ #define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage +#define LOG_FOR_RECOVERY(level) \ + MONGO_LOG_COMPONENT(level, ::mongo::logger::LogComponent::kStorageRecovery) #include "mongo/platform/basic.h" @@ -45,6 +47,7 @@ #include "mongo/db/namespace_string.h" #include "mongo/db/operation_context.h" #include "mongo/db/repl/repl_settings.h" +#include "mongo/db/server_recovery.h" #include "mongo/db/service_context.h" #include "mongo/db/storage/oplog_hack.h" #include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h" @@ -654,6 +657,17 @@ WiredTigerRecordStore::WiredTigerRecordStore(WiredTigerKVEngine* kvEngine, if (_isOplog) { checkOplogFormatVersion(ctx, _uri); } + + // Most record stores will not have their size metadata adjusted during replication recovery. + // However, if this record store was created during the recovery process, we will need to keep + // track of size adjustments for any writes applied to it during recovery. + const auto serviceCtx = getGlobalServiceContext(); + if (inReplicationRecovery(serviceCtx)) { + LOG_FOR_RECOVERY(2) + << "Marking newly-created record store as needing size adjustment during recovery. ns: " + << ns() << ", ident: " << _uri; + sizeRecoveryState(serviceCtx).markCollectionAsAlwaysNeedsSizeAdjustment(ns()); + } } WiredTigerRecordStore::~WiredTigerRecordStore() { @@ -703,8 +717,18 @@ void WiredTigerRecordStore::postConstructorInit(OperationContext* opCtx) { } while ((record = cursor->next())); } } else { + // We found no records in this collection; however, there may actually be documents present + // if writes to this collection were not included in the stable checkpoint the last time + // this node shut down. We set the data size and the record count to zero, but will adjust + // these if writes are played during startup recovery. + LOG_FOR_RECOVERY(2) << "Record store was empty; setting count metadata to zero but marking " + "record store as needing size adjustment during recovery. ns: " + << ns() << ", ident: " << _uri; + sizeRecoveryState(getGlobalServiceContext()) + .markCollectionAsAlwaysNeedsSizeAdjustment(ns()); _dataSize.store(0); _numRecords.store(0); + // Need to start at 1 so we are always higher than RecordId::min() _nextIdNum.store(1); if (_sizeStorer) @@ -1585,6 +1609,10 @@ private: }; void WiredTigerRecordStore::_changeNumRecords(OperationContext* opCtx, int64_t diff) { + if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) { + return; + } + opCtx->recoveryUnit()->registerChange(new NumRecordsChange(this, diff)); if (_numRecords.fetchAndAdd(diff) < 0) _numRecords.store(std::max(diff, int64_t(0))); @@ -1604,6 +1632,10 @@ private: }; void WiredTigerRecordStore::_increaseDataSize(OperationContext* opCtx, int64_t amount) { + if (!sizeRecoveryState(getGlobalServiceContext()).collectionNeedsSizeAdjustment(ns())) { + return; + } + if (opCtx) opCtx->recoveryUnit()->registerChange(new DataSizeChange(this, amount)); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h index 6ff78eae68a..d0dc5e6cfd2 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.h @@ -296,9 +296,27 @@ private: RecordId _nextId(); void _setId(RecordId id); bool cappedAndNeedDelete() const; + RecordData _getData(const WiredTigerCursor& cursor) const; + + /** + * Adjusts the record count and data size metadata for this record store, respectively. These + * functions consult the SizeRecoveryState to determine whether or not to actually change the + * size metadata if the server is undergoing recovery. + * + * For most record stores, we will not update the size metadata during recovery, as we trust + * that the values in the SizeStorer are accurate with respect to the end state of recovery. + * However, there are two exceptions: + * + * 1. When a record store is created as part of the recovery process. The SizeStorer will have + * no information about that newly-created ident. + * 2. When a record store is created at startup but constains no records as of the stable + * checkpoint timestamp. In this scenario, we will assume that the record store has a size + * of zero and will discard all cached size metadata. This assumption is incorrect if there + * are pending writes to this ident as part of the recovery process, and so we must + * always adjust size metadata for these idents. + */ void _changeNumRecords(OperationContext* opCtx, int64_t diff); void _increaseDataSize(OperationContext* opCtx, int64_t amount); - RecordData _getData(const WiredTigerCursor& cursor) const; const std::string _uri; |