summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Louis Williams <louis.williams@mongodb.com> 2018-09-17 12:19:59 -0400
committer: Louis Williams <louis.williams@mongodb.com> 2018-09-18 09:56:04 -0400
commit: 3acf1742d7bcec78997614edb1e8ef26ccf2331f (patch)
tree: cece832626c42671aee6f15343829b57b8be7adb
parent: 185e2c6c38e959bbad40b624c4afeef5743f9c03 (diff)
download: mongo-3acf1742d7bcec78997614edb1e8ef26ccf2331f.tar.gz
SERVER-35629 Use WiredTiger salvage API to repair corrupt metadata
-rw-r--r-- jstests/disk/wt_corrupt_file_errors.js 2
-rw-r--r-- jstests/disk/wt_repair_corrupt_metadata.js 81
-rw-r--r-- src/mongo/db/storage/wiredtiger/SConscript 10
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp 103
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h 11
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp 2
-rw-r--r-- src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp 13
7 files changed, 175 insertions, 47 deletions
diff --git a/jstests/disk/wt_corrupt_file_errors.js b/jstests/disk/wt_corrupt_file_errors.js
index f6e7204ccbf..5f829e5de8f 100644
--- a/jstests/disk/wt_corrupt_file_errors.js
+++ b/jstests/disk/wt_corrupt_file_errors.js
@@ -47,7 +47,7 @@
const WiredTigerWTFile = dbpath + "WiredTiger.wt";
jsTestLog("corrupting WiredTiger.wt");
corruptFile(WiredTigerWTFile);
- }, "Fatal Assertion 28595");
+ }, "Fatal Assertion 50944");
/**
* Test 4. Corrupt an index file.
diff --git a/jstests/disk/wt_repair_corrupt_metadata.js b/jstests/disk/wt_repair_corrupt_metadata.js
new file mode 100644
index 00000000000..16c12ac9bd7
--- /dev/null
+++ b/jstests/disk/wt_repair_corrupt_metadata.js
@@ -0,0 +1,81 @@
+/**
+ * Tests that --repair on WiredTiger correctly and gracefully handles corrupt metadata files.
+ *
+ * @tags: [requires_wiredtiger,requires_journaling]
+ */
+
+(function() {
+
+ load('jstests/disk/libs/wt_file_helper.js');
+
+ const baseName = "wt_repair_corrupt_metadata";
+ const collName = "test";
+ const dbpath = MongoRunner.dataPath + baseName + "/";
+
+ /**
+ * This test runs repair using a version of the WiredTiger.turtle file that has checkpoint
+ * information before the collection was created. The turtle file contains checkpoint
+ * information about the WiredTiger.wt file, so if these two files become out of sync,
+ * WiredTiger will have to attempt a salvage operation on the .wt file and rebuild the .turtle
+ * file.
+ *
+ * The expectation is that the metadata salvage will be successful, and that the collection will
+ * be recreated with all of its data.
+ */
+ let runTest = function(mongodOptions) {
+ // Unfortunately using --nojournal triggers a WT_PANIC and aborts in debug builds, which the
+ // following test case can exercise.
+ // TODO: This return can be removed once WT-4310 is completed.
+ if (db.adminCommand('buildInfo').debug && mongodOptions.hasOwnProperty('nojournal')) {
+ jsTestLog(
+ "Skipping test case because this is a debug build and --nojournal was provided.");
+ return;
+ }
+
+ resetDbpath(dbpath);
+ jsTestLog("Running test with args: " + tojson(mongodOptions));
+
+ const turtleFile = dbpath + "WiredTiger.turtle";
+ const turtleFileWithoutCollection = dbpath + "WiredTiger.turtle.1";
+
+ let mongod = startMongodOnExistingPath(dbpath, mongodOptions);
+
+ // Force a checkpoint and make a copy of the turtle file.
+ assert.commandWorked(mongod.getDB(baseName).adminCommand({fsync: 1}));
+ jsTestLog("Making copy of metadata file before creating the collection: " +
+ turtleFileWithoutCollection);
+ copyFile(turtleFile, turtleFileWithoutCollection);
+
+ let testColl = mongod.getDB(baseName)[collName];
+ assert.commandWorked(testColl.insert({a: 1}));
+
+ // Force another checkpoint before a clean shutdown.
+ assert.commandWorked(mongod.getDB(baseName).adminCommand({fsync: 1}));
+ MongoRunner.stopMongod(mongod);
+
+ // Guarantee the turtle files changed between checkpoints.
+ assert.neq(md5sumFile(turtleFileWithoutCollection), md5sumFile(turtleFile));
+
+ jsTestLog("Replacing metadata file with a version before the collection existed.");
+ removeFile(turtleFile);
+ copyFile(turtleFileWithoutCollection, turtleFile);
+
+ assertRepairSucceeds(dbpath, mongod.port, mongodOptions);
+
+ mongod = startMongodOnExistingPath(dbpath, mongodOptions);
+ testColl = mongod.getDB(baseName)[collName];
+
+ // The collection exists despite using an older turtle file because salvage is able to find
+ // the table in the WiredTiger.wt file.
+ assert(testColl.exists());
+ // We can assert that the data exists because the salvage only took place on the metadata,
+ // not the data.
+ assert.eq(testColl.find({}).itcount(), 1);
+ MongoRunner.stopMongod(mongod);
+ };
+
+ // Repair may behave differently with journaling enabled or disabled, but the end result should
+ // be the same.
+ runTest({journal: ""});
+ runTest({nojournal: ""});
+})();
diff --git a/src/mongo/db/storage/wiredtiger/SConscript b/src/mongo/db/storage/wiredtiger/SConscript
index 62f80ce6869..295dfd6ff36 100644
--- a/src/mongo/db/storage/wiredtiger/SConscript
+++ b/src/mongo/db/storage/wiredtiger/SConscript
@@ -83,6 +83,7 @@ if wiredtiger:
LIBDEPS_PRIVATE= [
'$BUILD_DIR/mongo/db/snapshot_window_options',
'$BUILD_DIR/mongo/util/options_parser/options_parser',
+ '$BUILD_DIR/mongo/db/storage/storage_repair_observer',
],
)
@@ -147,6 +148,7 @@ if wiredtiger:
'$BUILD_DIR/mongo/util/clock_source_mock',
],
LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/auth/authmocks',
'$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
'$BUILD_DIR/mongo/db/repl/replmocks',
],
@@ -191,6 +193,7 @@ if wiredtiger:
'additional_wiredtiger_record_store_tests',
],
LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/auth/authmocks',
'$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
'$BUILD_DIR/mongo/db/repl/replmocks',
],
@@ -205,6 +208,7 @@ if wiredtiger:
'additional_wiredtiger_record_store_tests',
],
LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/auth/authmocks',
'$BUILD_DIR/mongo/db/repl/repl_coordinator_interface',
'$BUILD_DIR/mongo/db/repl/replmocks',
],
@@ -218,6 +222,9 @@ if wiredtiger:
LIBDEPS=[
'additional_wiredtiger_index_tests',
],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/auth/authmocks',
+ ]
)
wtEnv.CppUnitTest(
@@ -228,6 +235,9 @@ if wiredtiger:
LIBDEPS=[
'additional_wiredtiger_index_tests',
],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/auth/authmocks',
+ ]
)
wtEnv.CppUnitTest(
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 79326f3f1cc..eb41d13245f 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -71,6 +71,7 @@
#include "mongo/db/storage/journal_listener.h"
#include "mongo/db/storage/storage_file_util.h"
#include "mongo/db/storage/storage_options.h"
+#include "mongo/db/storage/storage_repair_observer.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_extensions.h"
#include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h"
@@ -159,48 +160,6 @@ std::string WiredTigerFileVersion::getDowngradeString() {
return "compatibility=(release=3.1)";
}
-namespace {
-void openWiredTiger(const std::string& path,
- WT_EVENT_HANDLER* eventHandler,
- const std::string& wtOpenConfig,
- WT_CONNECTION** connOut,
- WiredTigerFileVersion* fileVersionOut) {
- std::string configStr = wtOpenConfig + ",compatibility=(require_min=\"3.1.0\")";
- int ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut);
- if (!ret) {
- *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_40};
- return;
- }
-
- // Arbiters do not replicate the FCV document. Due to arbiter FCV semantics on 4.0, shutting
- // down a 4.0 arbiter may either downgrade the data files to WT compatibility 2.9 or 3.0. Thus,
- // 4.2 binaries must allow starting up on 2.9 and 3.0 files.
- configStr = wtOpenConfig + ",compatibility=(require_min=\"3.0.0\")";
- ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut);
- if (!ret) {
- *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_36};
- return;
- }
-
- configStr = wtOpenConfig + ",compatibility=(require_min=\"2.9.0\")";
- ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut);
- if (!ret) {
- *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_34};
- return;
- }
-
- severe() << "Failed to start up WiredTiger under any compatibility version.";
- if (ret == EINVAL) {
- fassertFailedNoTrace(28561);
- }
-
- severe() << "Reason: " << wtRCToStatus(ret).reason();
- severe() << "Failed to open a WiredTiger connection. This may be due to metadata corruption. "
- << kWTRepairMsg;
- fassertFailedNoTrace(28595);
-}
-} // namespace
-
using std::set;
using std::string;
@@ -572,7 +531,7 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
string config = ss.str();
log() << "wiredtiger_open config: " << config;
- openWiredTiger(path, _eventHandler.getWtEventHandler(), config, &_conn, &_fileVersion);
+ _openWiredTiger(path, config);
_eventHandler.setStartupSuccessful();
_wtOpenConfig = config;
@@ -645,6 +604,64 @@ void WiredTigerKVEngine::appendGlobalStats(BSONObjBuilder& b) {
bb.done();
}
+void WiredTigerKVEngine::_openWiredTiger(const std::string& path, const std::string& wtOpenConfig) {
+ std::string configStr = wtOpenConfig + ",compatibility=(require_min=\"3.1.0\")";
+
+ auto wtEventHandler = _eventHandler.getWtEventHandler();
+
+ int ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn);
+ if (!ret) {
+ _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_40};
+ return;
+ }
+
+ // Arbiters do not replicate the FCV document. Due to arbiter FCV semantics on 4.0, shutting
+ // down a 4.0 arbiter may either downgrade the data files to WT compatibility 2.9 or 3.0. Thus,
+ // 4.2 binaries must allow starting up on 2.9 and 3.0 files.
+ configStr = wtOpenConfig + ",compatibility=(require_min=\"3.0.0\")";
+ ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn);
+ if (!ret) {
+ _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_36};
+ return;
+ }
+
+ configStr = wtOpenConfig + ",compatibility=(require_min=\"2.9.0\")";
+ ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn);
+ if (!ret) {
+ _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_34};
+ return;
+ }
+
+ warning() << "Failed to start up WiredTiger under any compatibility version.";
+ if (ret == EINVAL) {
+ fassertFailedNoTrace(28561);
+ }
+
+ if (ret == WT_TRY_SALVAGE) {
+ warning() << "WiredTiger metadata corruption detected";
+
+ if (!_inRepairMode) {
+ severe() << kWTRepairMsg;
+ fassertFailedNoTrace(50944);
+ }
+
+ warning() << "Attempting to salvage WiredTiger metadata";
+ configStr = wtOpenConfig + ",salvage=true";
+ ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn);
+ if (!ret) {
+ StorageRepairObserver::get(getGlobalServiceContext())
+ ->onModification("WiredTiger metadata salvaged");
+ return;
+ }
+
+ severe() << "Failed to salvage WiredTiger metadata: " + wtRCToStatus(ret).reason();
+ fassertFailedNoTrace(50947);
+ }
+
+ severe() << "Reason: " << wtRCToStatus(ret).reason();
+ fassertFailedNoTrace(28595);
+}
+
void WiredTigerKVEngine::cleanShutdown() {
log() << "WiredTigerKVEngine shutting down";
if (!_readOnly)
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
index f466a4eb35e..32d8e93d438 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h
@@ -336,6 +336,15 @@ private:
class WiredTigerJournalFlusher;
class WiredTigerCheckpointThread;
+ /**
+ * Opens a connection on the WiredTiger database 'path' with the configuration 'wtOpenConfig'.
+ * Only returns when successful. Initializes both '_conn' and '_fileVersion'.
+ *
+ * If corruption is detected and _inRepairMode is 'true', attempts to salvage the WiredTiger
+ * metadata.
+ */
+ void _openWiredTiger(const std::string& path, const std::string& wtOpenConfig);
+
Status _salvageIfNeeded(const char* uri);
void _ensureIdentPath(StringData ident);
@@ -383,6 +392,7 @@ private:
void _setOldestTimestamp(Timestamp newOldestTimestamp, bool force);
WT_CONNECTION* _conn;
+ WiredTigerFileVersion _fileVersion;
WiredTigerEventHandler _eventHandler;
std::unique_ptr<WiredTigerSessionCache> _sessionCache;
ClockSource* const _clockSource;
@@ -418,7 +428,6 @@ private:
std::unique_ptr<WiredTigerSession> _backupSession;
Timestamp _recoveryTimestamp;
- WiredTigerFileVersion _fileVersion;
// Tracks the stable and oldest timestamps we've set on the storage engine.
AtomicWord<std::uint64_t> _oldestTimestamp;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
index 8021fafd771..1007eb0193c 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp
@@ -131,7 +131,7 @@ WT_CURSOR* WiredTigerSession::getCursor(const std::string& uri, uint64_t id, boo
}
if (ret != 0) {
error() << "Failed to open a WiredTiger cursor: " << uri;
- error() << "This may be due to metadata corruption. " << kWTRepairMsg;
+ error() << "This may be due to data corruption. " << kWTRepairMsg;
fassertFailedNoTrace(50882);
}
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
index 15e5913a4d3..004062a6d04 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp
@@ -61,7 +61,8 @@ Status wtRCToStatus_slow(int retCode, const char* prefix) {
throw WriteConflictException();
}
- fassert(28559, retCode != WT_PANIC);
+ // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer.
+ fassert(28559, retCode != WT_PANIC || storageGlobalParams.repair);
str::stream s;
if (prefix)
@@ -403,6 +404,11 @@ int mdb_handle_error_with_startup_suppression(WT_EVENT_HANDLER* handler,
error() << "WiredTiger error (" << errorCode << ") " << redact(message)
<< " Raw: " << message;
+
+ // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer.
+ if (storageGlobalParams.repair) {
+ return 0;
+ }
fassert(50853, errorCode != WT_PANIC);
} catch (...) {
std::terminate();
@@ -416,6 +422,11 @@ int mdb_handle_error(WT_EVENT_HANDLER* handler,
const char* message) {
try {
error() << "WiredTiger error (" << errorCode << ") " << redact(message);
+
+ // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer.
+ if (storageGlobalParams.repair) {
+ return 0;
+ }
fassert(28558, errorCode != WT_PANIC);
} catch (...) {
std::terminate();