diff options
author | Louis Williams <louis.williams@mongodb.com> | 2018-09-17 12:19:59 -0400 |
---|---|---|
committer | Louis Williams <louis.williams@mongodb.com> | 2018-09-18 09:56:04 -0400 |
commit | 3acf1742d7bcec78997614edb1e8ef26ccf2331f (patch) | |
tree | cece832626c42671aee6f15343829b57b8be7adb | |
parent | 185e2c6c38e959bbad40b624c4afeef5743f9c03 (diff) | |
download | mongo-3acf1742d7bcec78997614edb1e8ef26ccf2331f.tar.gz |
SERVER-35629 Use WiredTiger salvage API to repair corrupt metadata
-rw-r--r-- | jstests/disk/wt_corrupt_file_errors.js | 2 | ||||
-rw-r--r-- | jstests/disk/wt_repair_corrupt_metadata.js | 81 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/SConscript | 10 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 103 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h | 11 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp | 2 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp | 13 |
7 files changed, 175 insertions, 47 deletions
diff --git a/jstests/disk/wt_corrupt_file_errors.js b/jstests/disk/wt_corrupt_file_errors.js index f6e7204ccbf..5f829e5de8f 100644 --- a/jstests/disk/wt_corrupt_file_errors.js +++ b/jstests/disk/wt_corrupt_file_errors.js @@ -47,7 +47,7 @@ const WiredTigerWTFile = dbpath + "WiredTiger.wt"; jsTestLog("corrupting WiredTiger.wt"); corruptFile(WiredTigerWTFile); - }, "Fatal Assertion 28595"); + }, "Fatal Assertion 50944"); /** * Test 4. Corrupt an index file. diff --git a/jstests/disk/wt_repair_corrupt_metadata.js b/jstests/disk/wt_repair_corrupt_metadata.js new file mode 100644 index 00000000000..16c12ac9bd7 --- /dev/null +++ b/jstests/disk/wt_repair_corrupt_metadata.js @@ -0,0 +1,81 @@ +/** + * Tests that --repair on WiredTiger correctly and gracefully handles corrupt metadata files. + * + * @tags: [requires_wiredtiger,requires_journaling] + */ + +(function() { + + load('jstests/disk/libs/wt_file_helper.js'); + + const baseName = "wt_repair_corrupt_metadata"; + const collName = "test"; + const dbpath = MongoRunner.dataPath + baseName + "/"; + + /** + * This test runs repair using a version of the WiredTiger.turtle file that has checkpoint + * information before the collection was created. The turtle file contains checkpoint + * information about the WiredTiger.wt file, so if these two files become out of sync, + * WiredTiger will have to attempt a salvage operation on the .wt file and rebuild the .turtle + * file. + * + * The expectation is that the metadata salvage will be successful, and that the collection will + * be recreated with all of its data. + */ + let runTest = function(mongodOptions) { + // Unfortunately using --nojournal triggers a WT_PANIC and aborts in debug builds, which the + // following test case can exercise. + // TODO: This return can be removed once WT-4310 is completed. 
+ if (db.adminCommand('buildInfo').debug && mongodOptions.hasOwnProperty('nojournal')) { + jsTestLog( + "Skipping test case because this is a debug build and --nojournal was provided."); + return; + } + + resetDbpath(dbpath); + jsTestLog("Running test with args: " + tojson(mongodOptions)); + + const turtleFile = dbpath + "WiredTiger.turtle"; + const turtleFileWithoutCollection = dbpath + "WiredTiger.turtle.1"; + + let mongod = startMongodOnExistingPath(dbpath, mongodOptions); + + // Force a checkpoint and make a copy of the turtle file. + assert.commandWorked(mongod.getDB(baseName).adminCommand({fsync: 1})); + jsTestLog("Making copy of metadata file before creating the collection: " + + turtleFileWithoutCollection); + copyFile(turtleFile, turtleFileWithoutCollection); + + let testColl = mongod.getDB(baseName)[collName]; + assert.commandWorked(testColl.insert({a: 1})); + + // Force another checkpoint before a clean shutdown. + assert.commandWorked(mongod.getDB(baseName).adminCommand({fsync: 1})); + MongoRunner.stopMongod(mongod); + + // Guarantee the turtle files changed between checkpoints. + assert.neq(md5sumFile(turtleFileWithoutCollection), md5sumFile(turtleFile)); + + jsTestLog("Replacing metadata file with a version before the collection existed."); + removeFile(turtleFile); + copyFile(turtleFileWithoutCollection, turtleFile); + + assertRepairSucceeds(dbpath, mongod.port, mongodOptions); + + mongod = startMongodOnExistingPath(dbpath, mongodOptions); + testColl = mongod.getDB(baseName)[collName]; + + // The collection exists despite using an older turtle file because salvage is able to find + // the table in the WiredTiger.wt file. + assert(testColl.exists()); + // We can assert that the data exists because the salvage only took place on the metadata, + // not the data. 
+ assert.eq(testColl.find({}).itcount(), 1); + MongoRunner.stopMongod(mongod); + }; + + // Repair may behave differently with journaling enabled or disabled, but the end result should + // be the same. + runTest({journal: ""}); + runTest({nojournal: ""}); +})(); diff --git a/src/mongo/db/storage/wiredtiger/SConscript b/src/mongo/db/storage/wiredtiger/SConscript index 62f80ce6869..295dfd6ff36 100644 --- a/src/mongo/db/storage/wiredtiger/SConscript +++ b/src/mongo/db/storage/wiredtiger/SConscript @@ -83,6 +83,7 @@ if wiredtiger: LIBDEPS_PRIVATE= [ '$BUILD_DIR/mongo/db/snapshot_window_options', '$BUILD_DIR/mongo/util/options_parser/options_parser', + '$BUILD_DIR/mongo/db/storage/storage_repair_observer', ], ) @@ -147,6 +148,7 @@ if wiredtiger: '$BUILD_DIR/mongo/util/clock_source_mock', ], LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/auth/authmocks', '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', '$BUILD_DIR/mongo/db/repl/replmocks', ], @@ -191,6 +193,7 @@ if wiredtiger: 'additional_wiredtiger_record_store_tests', ], LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/auth/authmocks', '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', '$BUILD_DIR/mongo/db/repl/replmocks', ], @@ -205,6 +208,7 @@ if wiredtiger: 'additional_wiredtiger_record_store_tests', ], LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/auth/authmocks', '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', '$BUILD_DIR/mongo/db/repl/replmocks', ], @@ -218,6 +222,9 @@ if wiredtiger: LIBDEPS=[ 'additional_wiredtiger_index_tests', ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/auth/authmocks', + ] ) wtEnv.CppUnitTest( @@ -228,6 +235,9 @@ if wiredtiger: LIBDEPS=[ 'additional_wiredtiger_index_tests', ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/auth/authmocks', + ] ) wtEnv.CppUnitTest( diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 79326f3f1cc..eb41d13245f 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ 
b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -71,6 +71,7 @@ #include "mongo/db/storage/journal_listener.h" #include "mongo/db/storage/storage_file_util.h" #include "mongo/db/storage/storage_options.h" +#include "mongo/db/storage/storage_repair_observer.h" #include "mongo/db/storage/wiredtiger/wiredtiger_customization_hooks.h" #include "mongo/db/storage/wiredtiger/wiredtiger_extensions.h" #include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h" @@ -159,48 +160,6 @@ std::string WiredTigerFileVersion::getDowngradeString() { return "compatibility=(release=3.1)"; } -namespace { -void openWiredTiger(const std::string& path, - WT_EVENT_HANDLER* eventHandler, - const std::string& wtOpenConfig, - WT_CONNECTION** connOut, - WiredTigerFileVersion* fileVersionOut) { - std::string configStr = wtOpenConfig + ",compatibility=(require_min=\"3.1.0\")"; - int ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut); - if (!ret) { - *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_40}; - return; - } - - // Arbiters do not replicate the FCV document. Due to arbiter FCV semantics on 4.0, shutting - // down a 4.0 arbiter may either downgrade the data files to WT compatibility 2.9 or 3.0. Thus, - // 4.2 binaries must allow starting up on 2.9 and 3.0 files. 
- configStr = wtOpenConfig + ",compatibility=(require_min=\"3.0.0\")"; - ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut); - if (!ret) { - *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_36}; - return; - } - - configStr = wtOpenConfig + ",compatibility=(require_min=\"2.9.0\")"; - ret = wiredtiger_open(path.c_str(), eventHandler, configStr.c_str(), connOut); - if (!ret) { - *fileVersionOut = {WiredTigerFileVersion::StartupVersion::IS_34}; - return; - } - - severe() << "Failed to start up WiredTiger under any compatibility version."; - if (ret == EINVAL) { - fassertFailedNoTrace(28561); - } - - severe() << "Reason: " << wtRCToStatus(ret).reason(); - severe() << "Failed to open a WiredTiger connection. This may be due to metadata corruption. " - << kWTRepairMsg; - fassertFailedNoTrace(28595); -} -} // namespace - using std::set; using std::string; @@ -572,7 +531,7 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName, string config = ss.str(); log() << "wiredtiger_open config: " << config; - openWiredTiger(path, _eventHandler.getWtEventHandler(), config, &_conn, &_fileVersion); + _openWiredTiger(path, config); _eventHandler.setStartupSuccessful(); _wtOpenConfig = config; @@ -645,6 +604,64 @@ void WiredTigerKVEngine::appendGlobalStats(BSONObjBuilder& b) { bb.done(); } +void WiredTigerKVEngine::_openWiredTiger(const std::string& path, const std::string& wtOpenConfig) { + std::string configStr = wtOpenConfig + ",compatibility=(require_min=\"3.1.0\")"; + + auto wtEventHandler = _eventHandler.getWtEventHandler(); + + int ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn); + if (!ret) { + _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_40}; + return; + } + + // Arbiters do not replicate the FCV document. Due to arbiter FCV semantics on 4.0, shutting + // down a 4.0 arbiter may either downgrade the data files to WT compatibility 2.9 or 3.0. 
Thus, + // 4.2 binaries must allow starting up on 2.9 and 3.0 files. + configStr = wtOpenConfig + ",compatibility=(require_min=\"3.0.0\")"; + ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn); + if (!ret) { + _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_36}; + return; + } + + configStr = wtOpenConfig + ",compatibility=(require_min=\"2.9.0\")"; + ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn); + if (!ret) { + _fileVersion = {WiredTigerFileVersion::StartupVersion::IS_34}; + return; + } + + warning() << "Failed to start up WiredTiger under any compatibility version."; + if (ret == EINVAL) { + fassertFailedNoTrace(28561); + } + + if (ret == WT_TRY_SALVAGE) { + warning() << "WiredTiger metadata corruption detected"; + + if (!_inRepairMode) { + severe() << kWTRepairMsg; + fassertFailedNoTrace(50944); + } + + warning() << "Attempting to salvage WiredTiger metadata"; + configStr = wtOpenConfig + ",salvage=true"; + ret = wiredtiger_open(path.c_str(), wtEventHandler, configStr.c_str(), &_conn); + if (!ret) { + StorageRepairObserver::get(getGlobalServiceContext()) + ->onModification("WiredTiger metadata salvaged"); + return; + } + + severe() << "Failed to salvage WiredTiger metadata: " + wtRCToStatus(ret).reason(); + fassertFailedNoTrace(50947); + } + + severe() << "Reason: " << wtRCToStatus(ret).reason(); + fassertFailedNoTrace(28595); +} + void WiredTigerKVEngine::cleanShutdown() { log() << "WiredTigerKVEngine shutting down"; if (!_readOnly) diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h index f466a4eb35e..32d8e93d438 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h @@ -336,6 +336,15 @@ private: class WiredTigerJournalFlusher; class WiredTigerCheckpointThread; + /** + * Opens a connection on the WiredTiger database 'path' with the configuration 
'wtOpenConfig'. + * Only returns when successful. Initializes both '_conn' and '_fileVersion'. + * + * If corruption is detected and _inRepairMode is 'true', attempts to salvage the WiredTiger + * metadata. + */ + void _openWiredTiger(const std::string& path, const std::string& wtOpenConfig); + Status _salvageIfNeeded(const char* uri); void _ensureIdentPath(StringData ident); @@ -383,6 +392,7 @@ private: void _setOldestTimestamp(Timestamp newOldestTimestamp, bool force); WT_CONNECTION* _conn; + WiredTigerFileVersion _fileVersion; WiredTigerEventHandler _eventHandler; std::unique_ptr<WiredTigerSessionCache> _sessionCache; ClockSource* const _clockSource; @@ -418,7 +428,6 @@ private: std::unique_ptr<WiredTigerSession> _backupSession; Timestamp _recoveryTimestamp; - WiredTigerFileVersion _fileVersion; // Tracks the stable and oldest timestamps we've set on the storage engine. AtomicWord<std::uint64_t> _oldestTimestamp; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp index 8021fafd771..1007eb0193c 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp @@ -131,7 +131,7 @@ WT_CURSOR* WiredTigerSession::getCursor(const std::string& uri, uint64_t id, boo } if (ret != 0) { error() << "Failed to open a WiredTiger cursor: " << uri; - error() << "This may be due to metadata corruption. " << kWTRepairMsg; + error() << "This may be due to data corruption. 
" << kWTRepairMsg; fassertFailedNoTrace(50882); } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp index 15e5913a4d3..004062a6d04 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_util.cpp @@ -61,7 +61,8 @@ Status wtRCToStatus_slow(int retCode, const char* prefix) { throw WriteConflictException(); } - fassert(28559, retCode != WT_PANIC); + // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer. + fassert(28559, retCode != WT_PANIC || storageGlobalParams.repair); str::stream s; if (prefix) @@ -403,6 +404,11 @@ int mdb_handle_error_with_startup_suppression(WT_EVENT_HANDLER* handler, error() << "WiredTiger error (" << errorCode << ") " << redact(message) << " Raw: " << message; + + // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer. + if (storageGlobalParams.repair) { + return 0; + } fassert(50853, errorCode != WT_PANIC); } catch (...) { std::terminate(); @@ -416,6 +422,11 @@ int mdb_handle_error(WT_EVENT_HANDLER* handler, const char* message) { try { error() << "WiredTiger error (" << errorCode << ") " << redact(message); + + // Don't abort on WT_PANIC when repairing, as the error will be handled at a higher layer. + if (storageGlobalParams.repair) { + return 0; + } fassert(28558, errorCode != WT_PANIC); } catch (...) { std::terminate(); |