author     Geert Bosch <geert@mongodb.com>    2018-07-06 15:34:20 -0400
committer  Geert Bosch <geert@mongodb.com>    2018-07-06 15:34:20 -0400
commit     ebe1ae8549dfc7ec7e12f9344c2da17a2ffb9acb (patch)
tree       067c7d8b968b414812d2310764506ba8af98cf85
parent     fda766f6be1a20fa28ce361511bc62e5c995186b (diff)
download   mongo-ebe1ae8549dfc7ec7e12f9344c2da17a2ffb9acb.tar.gz
SERVER-35112 Remove MMAPv1 code
156 files changed, 89 insertions, 33895 deletions
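The user-visible contract change is small but worth noting before reading the diff: repairDatabase now rejects the removed MMAPv1-only options with BadValue instead of honoring them, while still ignoring unrecognized fields. A hypothetical mongo shell session sketching the new behavior (it simply mirrors the jstest added below; the errmsg strings come from the new uassert calls in dbcommands.cpp):

```javascript
// Hypothetical shell session against a mongod built from this commit.

// The removed MMAPv1-only options now fail fast with ErrorCodes.BadValue (code 2):
db.runCommand({repairDatabase: 1, preserveClonedFilesOnFailure: 1});
// { ok: 0, errmsg: "preserveClonedFilesOnFailure not supported", code: 2, codeName: "BadValue" }

db.runCommand({repairDatabase: 1, backupOriginalFiles: 1});
// { ok: 0, errmsg: "backupOriginalFiles not supported", code: 2, codeName: "BadValue" }

// Unrecognized fields are still ignored, so generic callers keep working:
db.runCommand({repairDatabase: 1, someRandomUnknownOption: 1});
// { ok: 1 }
```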
diff --git a/jstests/noPassthroughWithMongod/repair_unsupported_options.js b/jstests/noPassthroughWithMongod/repair_unsupported_options.js
new file mode 100644
index 00000000000..da73875e753
--- /dev/null
+++ b/jstests/noPassthroughWithMongod/repair_unsupported_options.js
@@ -0,0 +1,10 @@
+// SERVER-35112: Test that specifying removed MMAPv1 specific options gives correct error.
+(function() {
+    'use strict';
+    db.repair_unsupported_options.drop();
+    assert.commandWorked(db.repair_unsupported_options.insert({}));  // Ensure database exists.
+    let badValue = (cmd) => assert.commandFailedWithCode(db.runCommand(cmd), ErrorCodes.BadValue);
+    badValue({repairDatabase: 1, preserveClonedFilesOnFailure: 1});
+    badValue({repairDatabase: 1, backupOriginalFiles: 1});
+    assert.commandWorked(db.runCommand({repairDatabase: 1, someRandomUnknownOption: 1}));
+})();
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 67d31359f66..b28085d4de5
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -471,7 +471,6 @@ env.Library(
         "mongod_options.cpp",
     ],
     LIBDEPS=[
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/mmap_v1_options',
         'repl/repl_settings',
         'repl/replica_set_messages',
         'server_options_servers',
@@ -695,6 +694,9 @@ env.Library(
         'index/index_access_method',
         'write_ops',
     ],
+    LIBDEPS_PRIVATE=[
+        '$BUILD_DIR/mongo/db/commands/server_status_core',
+    ],
 )
 
 env.Library(
@@ -858,19 +860,6 @@ env.Library(
 )
 
 env.Library(
-    target='prefetch',
-    source=[
-        'prefetch.cpp',
-    ],
-    LIBDEPS=[
-        'dbhelpers',
-        'index/index_access_method',
-        '$BUILD_DIR/mongo/db/stats/timer_stats',
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/mmap',
-    ],
-)
-
-env.Library(
     target="rw_concern_d",
     source=[
         "read_concern.cpp",
@@ -886,6 +875,9 @@ env.Library(
         "storage/storage_options",
         "s/sharding",
     ],
+    LIBDEPS_PRIVATE=[
+        "commands/server_status_core",
+    ],
 )
 
 env.Library(
@@ -896,7 +888,6 @@ env.Library(
     LIBDEPS=[
         '$BUILD_DIR/mongo/db/catalog/collection',
         '$BUILD_DIR/mongo/db/catalog/database',
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/repair_database_interface',
         'background',
         'logical_clock',
     ],
@@ -920,6 +911,7 @@ env.Library(
     ],
     LIBDEPS_PRIVATE=[
         '$BUILD_DIR/mongo/db/commands/fsync_locked',
+        'commands/server_status_core',
         'write_ops',
     ]
 )
@@ -1035,6 +1027,9 @@ env.Library(
         'storage/storage_options',
         'update/update_driver',
     ],
+    LIBDEPS_PRIVATE=[
+        "commands/server_status_core",
+    ],
 )
 
 env.Library(
@@ -1047,7 +1042,6 @@ env.Library(
     LIBDEPS=[
         "$BUILD_DIR/mongo/db/bson/dotted_path_support",
         "$BUILD_DIR/mongo/db/logical_time_metadata_hook",
-        "$BUILD_DIR/mongo/db/storage/mmap_v1/file_allocator",
         "$BUILD_DIR/mongo/db/ttl_collection_cache",
         "$BUILD_DIR/mongo/executor/network_interface_factory",
         "$BUILD_DIR/mongo/s/catalog/sharding_catalog_client_impl",
@@ -1085,7 +1079,6 @@ env.Library(
         "op_observer_d",
         "ops/write_ops_parsers",
         "pipeline/aggregation",
-        "prefetch",
         "query_exec",
         "repair_database",
         "repl/bgsync",
@@ -1111,8 +1104,6 @@ env.Library(
         "stats/top",
         "storage/devnull/storage_devnull",
         "storage/ephemeral_for_test/storage_ephemeral_for_test",
-        "storage/mmap_v1/mmap",
-        "storage/mmap_v1/storage_mmapv1",
         "storage/storage_engine_lock_file",
         "storage/storage_engine_metadata",
         "storage/storage_init_d",
diff --git a/src/mongo/db/catalog/SConscript b/src/mongo/db/catalog/SConscript
index 13287b8f11d..97f4402ed92
--- a/src/mongo/db/catalog/SConscript
+++ b/src/mongo/db/catalog/SConscript
@@ -294,9 +294,9 @@ env.Library(
         '$BUILD_DIR/mongo/db/views/views_mongod',
     ],
     LIBDEPS_PRIVATE=[
+        "$BUILD_DIR/mongo/db/commands/server_status_core",
         '$BUILD_DIR/mongo/db/logical_clock',
         '$BUILD_DIR/mongo/db/repl/repl_settings',
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/mmap_v1_options',
         '$BUILD_DIR/mongo/db/storage/storage_engine_common',
     ],
 )
diff --git a/src/mongo/db/catalog/collection.cpp b/src/mongo/db/catalog/collection.cpp
index 915980a34d3..d5a05ec5e02
--- a/src/mongo/db/catalog/collection.cpp
+++ b/src/mongo/db/catalog/collection.cpp
@@ -56,7 +56,6 @@
 #include "mongo/db/server_parameters.h"
 #include "mongo/db/service_context.h"
 #include "mongo/db/storage/key_string.h"
-#include "mongo/db/storage/mmap_v1/mmap_v1_options.h"
 #include "mongo/db/storage/record_fetcher.h"
 #include "mongo/db/storage/record_store.h"
 #include "mongo/db/update/update_driver.h"
diff --git a/src/mongo/db/catalog/collection_impl.cpp b/src/mongo/db/catalog/collection_impl.cpp
index 8ef6a71b469..f74fbabba89
--- a/src/mongo/db/catalog/collection_impl.cpp
+++ b/src/mongo/db/catalog/collection_impl.cpp
@@ -64,7 +64,6 @@
 #include "mongo/db/server_parameters.h"
 #include "mongo/db/service_context.h"
 #include "mongo/db/storage/key_string.h"
-#include "mongo/db/storage/mmap_v1/mmap_v1_options.h"
 #include "mongo/db/storage/record_fetcher.h"
 #include "mongo/db/storage/record_store.h"
 #include "mongo/db/update/update_driver.h"
@@ -799,9 +798,6 @@ bool CollectionImpl::_enforceQuota(bool userEnforeQuota) const {
     if (!userEnforeQuota)
         return false;
 
-    if (!mmapv1GlobalOptions.quota)
-        return false;
-
     if (_ns.db() == "local")
         return false;
 
diff --git a/src/mongo/db/commands/SConscript b/src/mongo/db/commands/SConscript
index 908214a6e2f..80078126aa1
--- a/src/mongo/db/commands/SConscript
+++ b/src/mongo/db/commands/SConscript
@@ -167,10 +167,10 @@ env.Library(
         "fsync.cpp",
     ],
     LIBDEPS_PRIVATE=[
+        '$BUILD_DIR/mongo/db/auth/authprivilege',
         '$BUILD_DIR/mongo/db/commands',
         '$BUILD_DIR/mongo/db/concurrency/write_conflict_exception',
         '$BUILD_DIR/mongo/db/curop',
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/storage_mmapv1',
         'fsync_locked',
     ]
 )
diff --git a/src/mongo/db/commands/dbcommands.cpp b/src/mongo/db/commands/dbcommands.cpp
index e95e188ece6..90d2cd2d5f0
--- a/src/mongo/db/commands/dbcommands.cpp
+++ b/src/mongo/db/commands/dbcommands.cpp
@@ -240,15 +240,16 @@ public:
         log() << "repairDatabase " << dbname;
         BackgroundOperation::assertNoBgOpInProgForDb(dbname);
 
-        e = cmdObj.getField("preserveClonedFilesOnFailure");
-        bool preserveClonedFilesOnFailure = e.isBoolean() && e.boolean();
-        e = cmdObj.getField("backupOriginalFiles");
-        bool backupOriginalFiles = e.isBoolean() && e.boolean();
+        uassert(ErrorCodes::BadValue,
+                "preserveClonedFilesOnFailure not supported",
+                !cmdObj.getField("preserveClonedFilesOnFailure").trueValue());
+        uassert(ErrorCodes::BadValue,
+                "backupOriginalFiles not supported",
+                !cmdObj.getField("backupOriginalFiles").trueValue());
 
         StorageEngine* engine = getGlobalServiceContext()->getStorageEngine();
         repl::UnreplicatedWritesBlock uwb(opCtx);
-        Status status = repairDatabase(
-            opCtx, engine, dbname, preserveClonedFilesOnFailure, backupOriginalFiles);
+        Status status = repairDatabase(opCtx, engine, dbname);
 
         // Open database before returning
         DatabaseHolder::getDatabaseHolder().openDb(opCtx, dbname);
diff --git a/src/mongo/db/commands/fsync.cpp b/src/mongo/db/commands/fsync.cpp
index 1af900475e7..a72454ab115
--- a/src/mongo/db/commands/fsync.cpp
+++ b/src/mongo/db/commands/fsync.cpp
@@ -48,7 +48,6 @@
 #include "mongo/db/concurrency/write_conflict_exception.h"
 #include "mongo/db/db.h"
#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/dur.h" #include "mongo/db/storage/storage_engine.h" #include "mongo/stdx/condition_variable.h" #include "mongo/util/assert_util.h" @@ -135,17 +134,6 @@ public: log() << "CMD fsync: sync:" << sync << " lock:" << lock; if (!lock) { - // the simple fsync command case - if (sync) { - // can this be GlobalRead? and if it can, it should be nongreedy. - Lock::GlobalWrite w(opCtx); - // TODO SERVER-26822: Replace MMAPv1 specific calls with ones that are storage - // engine agnostic. - getDur().commitNow(opCtx); - - // No WriteUnitOfWork needed, as this does no writes of its own. - } - // Take a global IS lock to ensure the storage engine is not shutdown Lock::GlobalLock global(opCtx, MODE_IS); StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine(); @@ -345,16 +333,6 @@ void FSyncLockThread::run() { OperationContext& opCtx = *opCtxPtr; Lock::GlobalWrite global(&opCtx); // No WriteUnitOfWork needed - try { - // TODO SERVER-26822: Replace MMAPv1 specific calls with ones that are storage engine - // agnostic. - getDur().syncDataAndTruncateJournal(&opCtx); - } catch (const std::exception& e) { - error() << "error doing syncDataAndTruncateJournal: " << e.what(); - fsyncCmd.threadStatus = Status(ErrorCodes::CommandFailed, e.what()); - fsyncCmd.acquireFsyncLockSyncCV.notify_one(); - return; - } opCtx.lockState()->downgradeGlobalXtoSForMMAPV1(); StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine(); diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp index a7e7a7e9d0a..851175c1a8f 100644 --- a/src/mongo/db/db.cpp +++ b/src/mongo/db/db.cpp @@ -126,7 +126,6 @@ #include "mongo/db/startup_warnings_mongod.h" #include "mongo/db/stats/counters.h" #include "mongo/db/storage/encryption_hooks.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" #include "mongo/db/storage/storage_engine.h" #include "mongo/db/storage/storage_engine_init.h" #include "mongo/db/storage/storage_options.h" @@ -405,21 +404,12 @@ ExitCode _initAndListen(int listenPort) { uassert(10296, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath)); } - { - std::stringstream ss; - ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist"; - uassert(12590, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath)); - } - initializeSNMP(); if (!storageGlobalParams.readOnly) { boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/"); } - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalRecoverOnly) - return EXIT_NET_ERROR; - if (mongodGlobalParams.scriptingEnabled) { ScriptEngine::setup(); } diff --git a/src/mongo/db/index/SConscript b/src/mongo/db/index/SConscript index 6b0b9d12793..3ec4e5a7d90 100644 --- a/src/mongo/db/index/SConscript +++ b/src/mongo/db/index/SConscript @@ -85,7 +85,6 @@ serveronlyEnv.Library( '$BUILD_DIR/mongo/db/concurrency/write_conflict_exception', '$BUILD_DIR/mongo/db/repl/repl_coordinator_interface', '$BUILD_DIR/mongo/db/storage/encryption_hooks', - '$BUILD_DIR/mongo/db/storage/mmap_v1/btree', '$BUILD_DIR/mongo/db/storage/storage_options', '$BUILD_DIR/third_party/shim_snappy', 'index_descriptor', diff --git a/src/mongo/db/index/index_access_method.cpp b/src/mongo/db/index/index_access_method.cpp index f31dde359cb..7ab26cbb9c2 100644 --- a/src/mongo/db/index/index_access_method.cpp +++ b/src/mongo/db/index/index_access_method.cpp @@ -80,14 +80,6 @@ bool isMultikeyFromPaths(const MultikeyPaths& multikeyPaths) { } // namespace 
 MONGO_EXPORT_SERVER_PARAMETER(failIndexKeyTooLong, bool, true);
 
-//
-// Comparison for external sorter interface
-//
-
-// Defined in db/structure/btree/key.cpp
-// XXX TODO: rename to something more descriptive, etc. etc.
-int oldCompare(const BSONObj& l, const BSONObj& r, const Ordering& o);
-
 class BtreeExternalSortComparison {
 public:
     BtreeExternalSortComparison(const BSONObj& ordering, IndexVersion version)
@@ -98,12 +90,8 @@ public:
     typedef std::pair<BSONObj, RecordId> Data;
 
     int operator()(const Data& l, const Data& r) const {
-        int x = (_version == IndexVersion::kV0
-                     ? oldCompare(l.first, r.first, _ordering)
-                     : l.first.woCompare(r.first, _ordering, /*considerfieldname*/ false));
-        if (x) {
+        if (int x = l.first.woCompare(r.first, _ordering, /*considerfieldname*/ false))
             return x;
-        }
         return l.second.compare(r.second);
     }
@@ -497,11 +485,6 @@ Status IndexAccessMethod::commitBulk(OperationContext* opCtx,
         }
 
         WriteUnitOfWork wunit(opCtx);
-        // Improve performance in the btree-building phase by disabling rollback tracking.
-        // This avoids copying all the written bytes to a buffer that is only used to roll back.
-        // Note that this is safe to do, as this entire index-build-in-progress will be cleaned
-        // up by the index system.
-        opCtx->recoveryUnit()->setRollbackWritesDisabled();
 
         // Get the next datum and add it to the builder.
         BulkBuilder::Sorter::Data data = it->next();
diff --git a/src/mongo/db/mongod_options.cpp b/src/mongo/db/mongod_options.cpp
index 510b62dc354..85d50af141f
--- a/src/mongo/db/mongod_options.cpp
+++ b/src/mongo/db/mongod_options.cpp
@@ -44,7 +44,6 @@
 #include "mongo/db/repl/repl_settings.h"
 #include "mongo/db/server_options.h"
 #include "mongo/db/server_options_server_helpers.h"
-#include "mongo/db/storage/mmap_v1/mmap_v1_options.h"
 #include "mongo/util/log.h"
 #include "mongo/util/mongoutils/str.h"
 #include "mongo/util/net/ssl_options.h"
@@ -222,42 +221,6 @@ Status addMongodOptions(moe::OptionSection* options) {
         .setSources(moe::SourceAllLegacy);
 
     storage_options
-        .addOptionChaining("storage.mmapv1.preallocDataFiles",
-                           "",
-                           moe::Bool,
-                           "disable data file preallocation - will often hurt performance",
-                           {"storage.preallocDataFiles"})
-        .setSources(moe::SourceYAMLConfig);
-
-    storage_options
-        .addOptionChaining("storage.mmapv1.nsSize",
-                           "nssize",
-                           moe::Int,
-                           ".ns file size (in MB) for new databases",
-                           {"storage.nsSize"})
-        .setDefault(moe::Value(16));
-
-    storage_options
-        .addOptionChaining("storage.mmapv1.quota.enforced",
-                           "quota",
-                           moe::Switch,
-                           "limits each database to a certain number of files (8 default)",
-                           {"storage.quota.enforced"})
-        .incompatibleWith("keyFile");
-
-    storage_options.addOptionChaining("storage.mmapv1.quota.maxFilesPerDB",
-                                      "quotaFiles",
-                                      moe::Int,
-                                      "number of files allowed per db, implies --quota",
-                                      {"storage.quota.maxFilesPerDB"});
-
-    storage_options.addOptionChaining("storage.mmapv1.smallFiles",
-                                      "smallfiles",
-                                      moe::Switch,
-                                      "use a smaller default file size",
-                                      {"storage.smallFiles"});
-
-    storage_options
         .addOptionChaining("storage.syncPeriodSecs",
                            "syncdelay",
                            moe::Double,
@@ -272,11 +235,6 @@ Status addMongodOptions(moe::OptionSection* options) {
     storage_options.addOptionChaining("repair", "repair", moe::Switch, "run repair on all dbs")
         .setSources(moe::SourceAllLegacy);
 
-    storage_options.addOptionChaining("storage.repairPath",
-                                      "repairpath",
-                                      moe::String,
-                                      "root directory for repair files - defaults to dbpath");
-
     // Javascript Options
 
     general_options
@@ -319,36 +277,6 @@ Status addMongodOptions(moe::OptionSection* options) {
     general_options.addOptionChaining("storage.journal.enabled", "", moe::Bool, "enable journaling")
         .setSources(moe::SourceYAMLConfig);
 
-    // Two ways to set durability diagnostic options. durOptions is deprecated
-    storage_options
-        .addOptionChaining("storage.mmapv1.journal.debugFlags",
-                           "journalOptions",
-                           moe::Int,
-                           "journal diagnostic options",
-                           {"storage.journal.debugFlags"})
-        .incompatibleWith("durOptions");
-
-    storage_options
-        .addOptionChaining("durOptions", "durOptions", moe::Int, "durability diagnostic options")
-        .hidden()
-        .setSources(moe::SourceAllLegacy)
-        .incompatibleWith("storage.mmapv1.journal.debugFlags");
-
-    storage_options.addOptionChaining("storage.journal.commitIntervalMs",
-                                      "journalCommitInterval",
-                                      moe::Int,
-                                      "how often to group/batch commit (ms)",
-                                      {"storage.mmapv1.journal.commitIntervalMs"});
-
-    // Deprecated option that we don't want people to use for performance reasons
-    storage_options
-        .addOptionChaining("storage.mmapv1.journal.nopreallocj",
-                           "nopreallocj",
-                           moe::Switch,
-                           "don't preallocate journal files")
-        .hidden()
-        .setSources(moe::SourceAll);
-
 #if defined(__linux__)
     general_options.addOptionChaining(
         "shutdown", "shutdown", moe::Switch, "kill a running server (for init scripts)");
@@ -670,24 +598,6 @@ Status canonicalizeMongodOptions(moe::Environment* params) {
         }
     }
 
-    // "storage.mmapv1.journal.durOptions" comes from the config file, so override it
-    // if "durOptions" is set since that comes from the command line.
-    if (params->count("durOptions")) {
-        int durOptions;
-        Status ret = params->get("durOptions", &durOptions);
-        if (!ret.isOK()) {
-            return ret;
-        }
-        ret = params->remove("durOptions");
-        if (!ret.isOK()) {
-            return ret;
-        }
-        ret = params->set("storage.mmapv1.journal.debugFlags", moe::Value(durOptions));
-        if (!ret.isOK()) {
-            return ret;
-        }
-    }
-
     // "security.authorization" comes from the config file, so override it if "auth" is
     // set since those come from the command line.
     if (params->count("auth")) {
@@ -704,20 +614,6 @@ Status canonicalizeMongodOptions(moe::Environment* params) {
         }
     }
 
-    // "storage.mmapv1.preallocDataFiles" comes from the config file, so override it if "noprealloc"
-    // is set since that comes from the command line.
-    if (params->count("noprealloc")) {
-        Status ret = params->set("storage.mmapv1.preallocDataFiles",
-                                 moe::Value(!(*params)["noprealloc"].as<bool>()));
-        if (!ret.isOK()) {
-            return ret;
-        }
-        ret = params->remove("noprealloc");
-        if (!ret.isOK()) {
-            return ret;
-        }
-    }
-
     // "sharding.archiveMovedChunks" comes from the config file, so override it if
     // "noMoveParanoia" or "moveParanoia" are set since those come from the command line.
if (params->count("noMoveParanoia")) { @@ -935,13 +831,6 @@ Status storeMongodOptions(const moe::Environment& params) { if (params.count("cpu")) { serverGlobalParams.cpu = params["cpu"].as<bool>(); } - if (params.count("storage.mmapv1.quota.enforced")) { - mmapv1GlobalOptions.quota = params["storage.mmapv1.quota.enforced"].as<bool>(); - } - if (params.count("storage.mmapv1.quota.maxFilesPerDB")) { - mmapv1GlobalOptions.quota = true; - mmapv1GlobalOptions.quotaFiles = params["storage.mmapv1.quota.maxFilesPerDB"].as<int>() - 1; - } if (params.count("storage.journal.enabled")) { storageGlobalParams.dur = params["storage.journal.enabled"].as<bool>(); @@ -961,12 +850,6 @@ Status storeMongodOptions(const moe::Environment& params) { << "ms)"); } } - if (params.count("storage.mmapv1.journal.debugFlags")) { - mmapv1GlobalOptions.journalOptions = params["storage.mmapv1.journal.debugFlags"].as<int>(); - } - if (params.count("storage.mmapv1.journal.nopreallocj")) { - mmapv1GlobalOptions.preallocj = !params["storage.mmapv1.journal.nopreallocj"].as<bool>(); - } if (params.count("security.javascriptEnabled")) { mongodGlobalParams.scriptingEnabled = params["security.javascriptEnabled"].as<bool>(); @@ -984,14 +867,6 @@ Status storeMongodOptions(const moe::Environment& params) { } } - if (params.count("storage.mmapv1.preallocDataFiles")) { - mmapv1GlobalOptions.prealloc = params["storage.mmapv1.preallocDataFiles"].as<bool>(); - log() << "note: noprealloc may hurt performance in many applications" << endl; - } - if (params.count("storage.mmapv1.smallFiles")) { - mmapv1GlobalOptions.smallfiles = params["storage.mmapv1.smallFiles"].as<bool>(); - } - if (params.count("repair") && params["repair"].as<bool>() == true) { storageGlobalParams.upgrade = 1; // --repair implies --upgrade storageGlobalParams.repair = 1; @@ -1028,14 +903,6 @@ Status storeMongodOptions(const moe::Environment& params) { serverGlobalParams.indexBuildRetry = params["storage.indexBuildRetry"].as<bool>(); } - if (params.count("storage.mmapv1.nsSize")) { - int x = params["storage.mmapv1.nsSize"].as<int>(); - if (x <= 0 || x > (0x7fffffff / 1024 / 1024)) { - return Status(ErrorCodes::BadValue, "bad --nssize arg"); - } - mmapv1GlobalOptions.lenForNewNsFiles = x * 1024 * 1024; - verify(mmapv1GlobalOptions.lenForNewNsFiles > 0); - } if (params.count("replication.oplogSizeMB")) { long long x = params["replication.oplogSizeMB"].as<int>(); if (x <= 0) { @@ -1134,23 +1001,6 @@ Status storeMongodOptions(const moe::Environment& params) { } #endif - // needs to be after things like --configsvr parsing, thus here. 
- if (params.count("storage.repairPath")) { - storageGlobalParams.repairpath = params["storage.repairPath"].as<std::string>(); - if (!storageGlobalParams.repairpath.size()) { - return Status(ErrorCodes::BadValue, "repairpath is empty"); - } - - if (storageGlobalParams.dur && - !str::startsWith(storageGlobalParams.repairpath, storageGlobalParams.dbpath)) { - return Status(ErrorCodes::BadValue, - "You must use a --repairpath that is a subdirectory of --dbpath when " - "using journaling"); - } - } else { - storageGlobalParams.repairpath = storageGlobalParams.dbpath; - } - // Check if we are 32 bit and have not explicitly specified any journaling options if (sizeof(void*) == 4 && !params.count("storage.journal.enabled")) { // trying to make this stand out more like startup warnings diff --git a/src/mongo/db/prefetch.cpp b/src/mongo/db/prefetch.cpp deleted file mode 100644 index a55993037bc..00000000000 --- a/src/mongo/db/prefetch.cpp +++ /dev/null @@ -1,274 +0,0 @@ -/** - * Copyright (C) 2008-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kReplication - -#include "mongo/platform/basic.h" - -#include "mongo/db/prefetch.h" - -#include "mongo/db/catalog/collection.h" -#include "mongo/db/catalog/database.h" -#include "mongo/db/catalog/index_catalog.h" -#include "mongo/db/commands/server_status_metric.h" -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/dbhelpers.h" -#include "mongo/db/index/index_access_method.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/repl/oplog_entry.h" -#include "mongo/db/repl/repl_settings.h" -#include "mongo/db/repl/replication_coordinator.h" -#include "mongo/db/server_parameters.h" -#include "mongo/db/stats/timer_stats.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::endl; -using std::string; - -namespace repl { -namespace { -// todo / idea: the prefetcher, when it fetches _id, on an upsert, will see if the record exists. if -// it does not, at write time, we can just do an insert, which will be faster. 
-
-// The count (of batches) and time spent fetching pages before application
-//    -- meaning depends on the prefetch behavior: all, _id index, none, etc.)
-TimerStats prefetchIndexStats;
-ServerStatusMetricField<TimerStats> displayPrefetchIndexPages("repl.preload.indexes",
-                                                              &prefetchIndexStats);
-TimerStats prefetchDocStats;
-ServerStatusMetricField<TimerStats> displayPrefetchDocPages("repl.preload.docs", &prefetchDocStats);
-
-// page in pages needed for all index lookups on a given object
-void prefetchIndexPages(OperationContext* opCtx,
-                        Collection* collection,
-                        const ReplSettings::IndexPrefetchConfig& prefetchConfig,
-                        const BSONObj& obj) {
-    // do we want prefetchConfig to be (1) as-is, (2) for update ops only, or (3) configured per op
-    // type? One might want PREFETCH_NONE for updates, but it's more rare that it is a bad idea for
-    // inserts. #3 (per op), a big issue would be "too many knobs".
-    switch (prefetchConfig) {
-        case ReplSettings::IndexPrefetchConfig::PREFETCH_NONE:
-            return;
-        case ReplSettings::IndexPrefetchConfig::PREFETCH_ID_ONLY: {
-            TimerHolder timer(&prefetchIndexStats);
-            // on the update op case, the call to prefetchRecordPages will touch the _id index.
-            // thus perhaps this option isn't very useful?
-            try {
-                IndexDescriptor* desc = collection->getIndexCatalog()->findIdIndex(opCtx);
-                if (!desc)
-                    return;
-                IndexAccessMethod* iam = collection->getIndexCatalog()->getIndex(desc);
-                invariant(iam);
-                iam->touch(opCtx, obj).transitional_ignore();
-            } catch (const DBException& e) {
-                LOG(2) << "ignoring exception in prefetchIndexPages(): " << redact(e);
-            }
-            break;
-        }
-        case ReplSettings::IndexPrefetchConfig::PREFETCH_ALL: {
-            // indexCount includes all indexes, including ones
-            // in the process of being built
-            IndexCatalog::IndexIterator ii =
-                collection->getIndexCatalog()->getIndexIterator(opCtx, true);
-            while (ii.more()) {
-                TimerHolder timer(&prefetchIndexStats);
-                // This will page in all index pages for the given object.
-                try {
-                    IndexDescriptor* desc = ii.next();
-                    IndexAccessMethod* iam = collection->getIndexCatalog()->getIndex(desc);
-                    verify(iam);
-                    iam->touch(opCtx, obj).transitional_ignore();
-                } catch (const DBException& e) {
-                    LOG(2) << "ignoring exception in prefetchIndexPages(): " << redact(e);
-                }
-            }
-            break;
-        }
-        default:
-            fassertFailed(16427);
-    }
-}
-
-// page in the data pages for a record associated with an object
-void prefetchRecordPages(OperationContext* opCtx,
-                         Database* db,
-                         const char* ns,
-                         const BSONObj& obj) {
-    BSONElement _id;
-    if (obj.getObjectID(_id)) {
-        TimerHolder timer(&prefetchDocStats);
-        BSONObjBuilder builder;
-        builder.append(_id);
-        BSONObj result;
-        try {
-            if (Helpers::findById(opCtx, db, ns, builder.done(), result)) {
-                // do we want to use Record::touch() here? it's pretty similar.
-                // volatile - avoid compiler optimizations for touching a mmap page
-                volatile char _dummy_char = '\0';  // NOLINT
-
-                // Touch the first word on every page in order to fault it into memory
-                for (int i = 0; i < result.objsize(); i += getMinOSPageSizeBytes()) {
-                    _dummy_char += *(result.objdata() + i);
-                }
-                // hit the last page, in case we missed it above
-                _dummy_char += *(result.objdata() + result.objsize() - 1);
-            }
-        } catch (const DBException& e) {
-            LOG(2) << "ignoring exception in prefetchRecordPages(): " << redact(e);
-        }
-    }
-}
-}  // namespace
-
-// prefetch for an oplog operation
-void prefetchPagesForReplicatedOp(OperationContext* opCtx,
-                                  Database* db,
-                                  const OplogEntry& oplogEntry) {
-    invariant(db);
-    const ReplSettings::IndexPrefetchConfig prefetchConfig =
-        ReplicationCoordinator::get(opCtx)->getIndexPrefetchConfig();
-
-    // Prefetch ignores non-CRUD operations.
-    if (!oplogEntry.isCrudOpType()) {
-        return;
-    }
-
-    // This will have to change for engines other than MMAP V1, because they might not have
-    // means for directly prefetching pages from the collection. For this purpose, acquire S
-    // lock on the database, instead of optimizing with IS.
-    const auto& nss = oplogEntry.getNamespace();
-    Lock::CollectionLock collLock(opCtx->lockState(), nss.ns(), MODE_S);
-
-    Collection* collection = db->getCollection(opCtx, nss);
-    if (!collection) {
-        return;
-    }
-
-    auto opType = oplogEntry.getOpType();
-    LOG(4) << "index prefetch for op " << OpType_serializer(opType);
-
-    // should we prefetch index pages on updates? if the update is in-place and doesn't change
-    // indexed values, it is actually slower - a lot slower if there are a dozen indexes or
-    // lots of multikeys. possible variations (not all mutually exclusive):
-    //  1) current behavior: full prefetch
-    //  2) don't do it for updates
-    //  3) don't do multikey indexes for updates
-    //  4) don't prefetchIndexPages on some heuristic; e.g., if it's an $inc.
-    //  5) if not prefetching index pages (#2), we should do it if we are upsertings and it
-    //     will be an insert. to do that we could do the prefetchRecordPage first and if DNE
-    //     then we do #1.
-    //
-    // note that on deletes 'obj' does not have all the keys we would want to prefetch on.
-    // a way to achieve that would be to prefetch the record first, and then afterwards do
-    // this part.
-    //
-    auto obj = oplogEntry.getOperationToApply();
-    invariant(!obj.isEmpty());
-    prefetchIndexPages(opCtx, collection, prefetchConfig, obj);
-
-    // do not prefetch the data for inserts; it doesn't exist yet
-    //
-    // we should consider doing the record prefetch for the delete op case as we hit the record
-    // when we delete. note if done we only want to touch the first page.
-    //
-    // update: do record prefetch.
-    if ((opType == OpTypeEnum::kUpdate) &&
-        // do not prefetch the data for capped collections because
-        // they typically do not have an _id index for findById() to use.
-        !collection->isCapped()) {
-        prefetchRecordPages(opCtx, db, nss.ns().c_str(), obj);
-    }
-}
-
-class ReplIndexPrefetch : public ServerParameter {
-public:
-    ReplIndexPrefetch() : ServerParameter(ServerParameterSet::getGlobal(), "replIndexPrefetch") {}
-
-    virtual ~ReplIndexPrefetch() {}
-
-    const char* _value() {
-        if (ReplicationCoordinator::get(getGlobalServiceContext())->getReplicationMode() !=
-            ReplicationCoordinator::modeReplSet) {
-            return "uninitialized";
-        }
-        ReplSettings::IndexPrefetchConfig ip =
-            ReplicationCoordinator::get(getGlobalServiceContext())->getIndexPrefetchConfig();
-        switch (ip) {
-            case ReplSettings::IndexPrefetchConfig::PREFETCH_NONE:
-                return "none";
-            case ReplSettings::IndexPrefetchConfig::PREFETCH_ID_ONLY:
-                return "_id_only";
-            case ReplSettings::IndexPrefetchConfig::PREFETCH_ALL:
-                return "all";
-            default:
-                return "invalid";
-        }
-    }
-
-    virtual void append(OperationContext* opCtx, BSONObjBuilder& b, const string& name) {
-        b.append(name, _value());
-    }
-
-    virtual Status set(const BSONElement& newValueElement) {
-        if (ReplicationCoordinator::get(getGlobalServiceContext())->getReplicationMode() !=
-            ReplicationCoordinator::modeReplSet) {
-            return Status(ErrorCodes::BadValue, "replication is not enabled");
-        }
-
-        std::string prefetch = newValueElement.valuestrsafe();
-        return setFromString(prefetch);
-    }
-
-    virtual Status setFromString(const string& prefetch) {
-        log() << "changing replication index prefetch behavior to " << prefetch;
-
-        ReplSettings::IndexPrefetchConfig prefetchConfig;
-
-        if (prefetch == "none")
-            prefetchConfig = ReplSettings::IndexPrefetchConfig::PREFETCH_NONE;
-        else if (prefetch == "_id_only")
-            prefetchConfig = ReplSettings::IndexPrefetchConfig::PREFETCH_ID_ONLY;
-        else if (prefetch == "all")
-            prefetchConfig = ReplSettings::IndexPrefetchConfig::PREFETCH_ALL;
-        else {
-            return Status(ErrorCodes::BadValue,
-                          str::stream() << "unrecognized indexPrefetch setting: " << prefetch);
-        }
-
-        ReplicationCoordinator::get(getGlobalServiceContext())
-            ->setIndexPrefetchConfig(prefetchConfig);
-        return Status::OK();
-    }
-
-} replIndexPrefetch;
-
-}  // namespace repl
-}  // namespace mongo
diff --git a/src/mongo/db/prefetch.h b/src/mongo/db/prefetch.h
deleted file mode 100644
index 1f5576e31e7..00000000000
--- a/src/mongo/db/prefetch.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
-* Copyright (C) 2008 10gen Inc.
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Affero General Public License, version 3,
-* as published by the Free Software Foundation.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-* GNU Affero General Public License for more details.
-*
-* You should have received a copy of the GNU Affero General Public License
-* along with this program. If not, see <http://www.gnu.org/licenses/>.
-*
-* As a special exception, the copyright holders give permission to link the
-* code of portions of this program with the OpenSSL library under certain
-* conditions as described in each individual source file and distribute
-* linked combinations including the program with the OpenSSL library. You
-* must comply with the GNU Affero General Public License in all respects for
-* all of the code used other than as permitted herein. If you modify file(s)
-* with this exception, you may extend this exception to your version of the
-* file(s), but you are not obligated to do so. If you do not wish to do so,
-* delete this exception statement from your version. If you delete this
-* exception statement from all source files in the program, then also delete
-* it in the license file.
-*/
-#pragma once
-
-namespace mongo {
-
-class BSONObj;
-class Database;
-class OperationContext;
-
-namespace repl {
-
-class OplogEntry;
-
-// page in possible index and/or data pages for an op from the oplog
-void prefetchPagesForReplicatedOp(OperationContext* opCtx,
-                                  Database* db,
-                                  const OplogEntry& oplogEntry);
-
-}  // namespace repl
-}  // namespace mongo
diff --git a/src/mongo/db/repair_database.cpp b/src/mongo/db/repair_database.cpp
index df89ce310c6..eafca15fe15
--- a/src/mongo/db/repair_database.cpp
+++ b/src/mongo/db/repair_database.cpp
@@ -51,7 +51,6 @@
 #include "mongo/db/catalog/uuid_catalog.h"
 #include "mongo/db/index/index_descriptor.h"
 #include "mongo/db/logical_clock.h"
-#include "mongo/db/storage/mmap_v1/repair_database_interface.h"
 #include "mongo/db/storage/storage_engine.h"
 #include "mongo/util/log.h"
 #include "mongo/util/scopeguard.h"
@@ -230,11 +229,7 @@ Status rebuildIndexesOnCollection(OperationContext* opCtx,
     return Status::OK();
 }
 
-Status repairDatabase(OperationContext* opCtx,
-                      StorageEngine* engine,
-                      const std::string& dbName,
-                      bool preserveClonedFilesOnFailure,
-                      bool backupOriginalFiles) {
+Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std::string& dbName) {
     DisableDocumentValidation validationDisabler(opCtx);
 
     // We must hold some form of lock here
@@ -247,24 +242,6 @@ Status repairDatabase(OperationContext* opCtx,
 
     opCtx->checkForInterrupt();
 
-    if (engine->isMmapV1()) {
-        // MMAPv1 is a layering violation so it implements its own repairDatabase. Call through a
-        // shimmed interface, so the symbol can exist independent of mmapv1.
-        auto status = repairDatabaseMmapv1(
-            engine, opCtx, dbName, preserveClonedFilesOnFailure, backupOriginalFiles);
-        // Restore oplog Collection pointer cache.
-        repl::acquireOplogCollectionForLogging(opCtx);
-        return status;
-    }
-
-    // These are MMAPv1 specific
-    if (preserveClonedFilesOnFailure) {
-        return Status(ErrorCodes::BadValue, "preserveClonedFilesOnFailure not supported");
-    }
-    if (backupOriginalFiles) {
-        return Status(ErrorCodes::BadValue, "backupOriginalFiles not supported");
-    }
-
     // Close the db and invalidate all current users and caches.
     DatabaseHolder::getDatabaseHolder().close(opCtx, dbName, "database closed for repair");
     ON_BLOCK_EXIT([&dbName, &opCtx] {
diff --git a/src/mongo/db/repair_database.h b/src/mongo/db/repair_database.h
index 1aa3d4bb911..55dbc05b52b
--- a/src/mongo/db/repair_database.h
+++ b/src/mongo/db/repair_database.h
@@ -73,9 +73,5 @@ Status rebuildIndexesOnCollection(OperationContext* opCtx,
  * Some data may be lost or modified in the process but the output will
  * be structurally valid on successful return.
  */
-Status repairDatabase(OperationContext* opCtx,
-                      StorageEngine* engine,
-                      const std::string& dbName,
-                      bool preserveClonedFilesOnFailure = false,
-                      bool backupOriginalFiles = false);
+Status repairDatabase(OperationContext* opCtx, StorageEngine* engine, const std::string& dbName);
 }
diff --git a/src/mongo/db/repair_database_and_check_version.cpp b/src/mongo/db/repair_database_and_check_version.cpp
index 6b9d121f9dd..54be809d90e
--- a/src/mongo/db/repair_database_and_check_version.cpp
+++ b/src/mongo/db/repair_database_and_check_version.cpp
@@ -50,7 +50,6 @@
 #include "mongo/db/repl/drop_pending_collection_reaper.h"
 #include "mongo/db/repl/replication_coordinator.h"
 #include "mongo/db/server_options.h"
-#include "mongo/db/storage/mmap_v1/mmap_v1_options.h"
 #include "mongo/util/exit.h"
 #include "mongo/util/log.h"
 #include "mongo/util/quick_exit.h"
@@ -501,7 +500,7 @@ StatusWith<bool> repairDatabasesAndCheckVersion(OperationContext* opCtx) {
         if (replSettings.usingReplSets()) {
             // We only care about _id indexes and drop-pending collections if we are in a replset.
             checkForIdIndexesAndDropPendingCollections(opCtx, db);
-            // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped
+            // Ensure oplog is capped (mongodb does not guarantee order of inserts on noncapped
             // collections)
             if (db->name() == "local") {
                 checkForCappedOplog(opCtx, db);
@@ -519,13 +518,7 @@ StatusWith<bool> repairDatabasesAndCheckVersion(OperationContext* opCtx) {
     if (!fcvDocumentExists && nonLocalDatabases) {
         severe()
             << "Unable to start up mongod due to missing featureCompatibilityVersion document.";
-        if (opCtx->getServiceContext()->getStorageEngine()->isMmapV1()) {
-            severe() << "Please run with --journalOptions "
-                     << static_cast<int>(MMAPV1Options::JournalRecoverOnly)
-                     << " to recover the journal. Then run with --repair to restore the document.";
-        } else {
-            severe() << "Please run with --repair to restore the document.";
-        }
+        severe() << "Please run with --repair to restore the document.";
         fassertFailedNoTrace(40652);
     }
diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript
index e9a4998db59..e413e6d8323
--- a/src/mongo/db/repl/SConscript
+++ b/src/mongo/db/repl/SConscript
@@ -659,7 +659,6 @@ env.Library(
         '$BUILD_DIR/mongo/db/concurrency/lock_manager',
        '$BUILD_DIR/mongo/db/concurrency/write_conflict_exception',
        '$BUILD_DIR/mongo/db/curop',
-        '$BUILD_DIR/mongo/db/prefetch',
        '$BUILD_DIR/mongo/db/query_exec',
        '$BUILD_DIR/mongo/db/s/sharding_runtime_d',
        '$BUILD_DIR/mongo/db/stats/timer_stats',
diff --git a/src/mongo/db/repl/replication_coordinator_external_state.h b/src/mongo/db/repl/replication_coordinator_external_state.h
index c9c6d4192b4..e70f68f47a8
--- a/src/mongo/db/repl/replication_coordinator_external_state.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state.h
@@ -110,12 +110,6 @@ public:
     virtual ThreadPool* getDbWorkThreadPool() const = 0;
 
     /**
-     * Runs the repair database command on the "local" db, if the storage engine is MMapV1.
-     * Note: Used after initial sync to compact the database files.
-     */
-    virtual Status runRepairOnLocalDB(OperationContext* opCtx) = 0;
-
-    /**
      * Creates the oplog, writes the first entry and stores the replica set config document.
      */
     virtual Status initializeReplSetStorage(OperationContext* opCtx, const BSONObj& config) = 0;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 809cb5140b5..a17d3506ca7
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -407,26 +407,6 @@ ThreadPool* ReplicationCoordinatorExternalStateImpl::getDbWorkThreadPool() const
     return _writerPool.get();
 }
 
-Status ReplicationCoordinatorExternalStateImpl::runRepairOnLocalDB(OperationContext* opCtx) {
-    try {
-        Lock::GlobalWrite globalWrite(opCtx);
-        StorageEngine* engine = getGlobalServiceContext()->getStorageEngine();
-
-        if (!engine->isMmapV1()) {
-            return Status::OK();
-        }
-
-        UnreplicatedWritesBlock uwb(opCtx);
-        Status status = repairDatabase(opCtx, engine, localDbName, false, false);
-
-        // Open database before returning
-        DatabaseHolder::getDatabaseHolder().openDb(opCtx, localDbName);
-    } catch (const DBException& ex) {
-        return ex.toStatus();
-    }
-    return Status::OK();
-}
-
 Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* opCtx,
                                                                          const BSONObj& config) {
     try {
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.h b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
index 8607be1af20..797175b8111
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.h
@@ -76,7 +76,6 @@ public:
     virtual void shutdown(OperationContext* opCtx);
     virtual executor::TaskExecutor* getTaskExecutor() const override;
     virtual ThreadPool* getDbWorkThreadPool() const override;
-    virtual Status runRepairOnLocalDB(OperationContext* opCtx) override;
     virtual Status initializeReplSetStorage(OperationContext* opCtx, const BSONObj& config);
     virtual void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx);
     void onDrainComplete(OperationContext* opCtx) override;
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
index 5a4ab25d6e4..30568571626
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.cpp
@@ -72,10 +72,6 @@ void ReplicationCoordinatorExternalStateMock::startSteadyStateReplication(Operat
 
 void ReplicationCoordinatorExternalStateMock::stopDataReplication(OperationContext*) {}
 
-Status ReplicationCoordinatorExternalStateMock::runRepairOnLocalDB(OperationContext* opCtx) {
-    return Status::OK();
-}
-
 Status ReplicationCoordinatorExternalStateMock::initializeReplSetStorage(OperationContext* opCtx,
                                                                          const BSONObj& config) {
     return storeLocalConfigDocument(opCtx, config);
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_mock.h b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
index e8169616079..05d08eb0f6d
--- a/src/mongo/db/repl/replication_coordinator_external_state_mock.h
+++ b/src/mongo/db/repl/replication_coordinator_external_state_mock.h
@@ -65,7 +65,6 @@ public:
     virtual void shutdown(OperationContext* opCtx);
     virtual executor::TaskExecutor* getTaskExecutor() const override;
     virtual ThreadPool* getDbWorkThreadPool() const override;
-    virtual Status runRepairOnLocalDB(OperationContext* opCtx) override;
     virtual Status initializeReplSetStorage(OperationContext* opCtx,
                                             const BSONObj& config);
     virtual void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx);
     void onDrainComplete(OperationContext* opCtx) override;
diff --git a/src/mongo/db/repl/replication_coordinator_impl.cpp b/src/mongo/db/repl/replication_coordinator_impl.cpp
index b6d02ba07c0..478713735b5
--- a/src/mongo/db/repl/replication_coordinator_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_impl.cpp
@@ -698,13 +698,11 @@ void ReplicationCoordinatorImpl::_startDataReplication(OperationContext* opCtx,
         if (startCompleted) {
             startCompleted();
         }
-        // Repair local db (to compact it).
-        auto opCtxHolder = cc().makeOperationContext();
-        uassertStatusOK(_externalState->runRepairOnLocalDB(opCtxHolder.get()));
         // Because initial sync completed, we can only be in STARTUP2, not REMOVED.
         // Transition from STARTUP2 to RECOVERING and start the producer and the applier.
         invariant(getMemberState().startup2());
         invariant(setFollowerMode(MemberState::RS_RECOVERING));
+        auto opCtxHolder = cc().makeOperationContext();
         _externalState->startSteadyStateReplication(opCtxHolder.get(), this);
     };
diff --git a/src/mongo/db/repl/sync_tail.cpp b/src/mongo/db/repl/sync_tail.cpp
index 300b2c67913..94fbca8fa6f
--- a/src/mongo/db/repl/sync_tail.cpp
+++ b/src/mongo/db/repl/sync_tail.cpp
@@ -55,7 +55,6 @@
 #include "mongo/db/logical_session_id.h"
 #include "mongo/db/multi_key_path_tracker.h"
 #include "mongo/db/namespace_string.h"
-#include "mongo/db/prefetch.h"
 #include "mongo/db/query/query_knobs.h"
 #include "mongo/db/repl/applier_helpers.h"
 #include "mongo/db/repl/apply_ops.h"
@@ -349,38 +348,6 @@ const OplogApplier::Options& SyncTail::getOptions() const {
 
 namespace {
 
-// The pool threads call this to prefetch each op
-void prefetchOp(const OplogEntry& oplogEntry) {
-    const auto& nss = oplogEntry.getNamespace();
-    if (!nss.isEmpty()) {
-        try {
-            // one possible tweak here would be to stay in the read lock for this database
-            // for multiple prefetches if they are for the same database.
-            const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext();
-            OperationContext& opCtx = *opCtxPtr;
-            AutoGetCollectionForReadCommand ctx(&opCtx, nss);
-            Database* db = ctx.getDb();
-            if (db) {
-                prefetchPagesForReplicatedOp(&opCtx, db, oplogEntry);
-            }
-        } catch (const DBException& e) {
-            LOG(2) << "ignoring exception in prefetchOp(): " << redact(e) << endl;
-        } catch (const std::exception& e) {
-            log() << "Unhandled std::exception in prefetchOp(): " << redact(e.what()) << endl;
-            fassertFailed(16397);
-        }
-    }
-}
-
-// Doles out all the work to the reader pool threads and waits for them to complete
-void prefetchOps(const MultiApplier::Operations& ops, ThreadPool* prefetcherPool) {
-    invariant(prefetcherPool);
-    for (auto&& op : ops) {
-        invariant(prefetcherPool->schedule([&] { prefetchOp(op); }));
-    }
-    prefetcherPool->waitForIdle();
-}
-
 // Doles out all the work to the writer pool threads.
 // Does not modify writerVectors, but passes non-const pointers to inner vectors into func.
 void applyOps(std::vector<MultiApplier::OperationPtrs>& writerVectors,
@@ -1255,11 +1222,6 @@ Status multiSyncApply(OperationContext* opCtx,
 StatusWith<OpTime> SyncTail::multiApply(OperationContext* opCtx, MultiApplier::Operations ops) {
     invariant(!ops.empty());
 
-    if (isMMAPV1()) {
-        // Use a ThreadPool to prefetch all the operations in a batch.
-        prefetchOps(ops, _writerPool);
-    }
-
     LOG(2) << "replication batch size is " << ops.size();
 
     // Stop all readers until we're done. This also prevents doc-locking engines from deleting old
     // entries from the oplog until we finish writing.
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index 0a2e629e9b6..988e5ffc40c
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -8,7 +8,6 @@ env.SConscript(
         'devnull',
         'ephemeral_for_test',
         'kv',
-        'mmap_v1',
         'wiredtiger',
         'mobile',
     ],
@@ -246,9 +245,6 @@ env.Library(
         '$BUILD_DIR/mongo/base',
         '$BUILD_DIR/mongo/db/bson/dotted_path_support',
     ],
-    LIBDEPS_PRIVATE=[
-        '$BUILD_DIR/mongo/db/storage/mmap_v1/paths',
-    ],
 )
 
 env.CppUnitTest(
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
index b5e99506bf8..b64ec36da99
--- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
+++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_recovery_unit.h
@@ -64,12 +64,6 @@ public:
         _changes.push_back(ChangePtr(change));
     }
 
-    virtual void* writingPtr(void* data, size_t len) {
-        MONGO_UNREACHABLE;
-    }
-
-    virtual void setRollbackWritesDisabled() {}
-
     virtual SnapshotId getSnapshotId() const {
         return SnapshotId();
     }
diff --git a/src/mongo/db/storage/mmap_v1/SConscript b/src/mongo/db/storage/mmap_v1/SConscript
deleted file mode 100644
index 857bcfac99f..00000000000
--- a/src/mongo/db/storage/mmap_v1/SConscript
+++ /dev/null
@@ -1,310 +0,0 @@
-# -*- mode: python -*-
-
-Import("env")
-Import("mmapv1")
-
-env = env.Clone()
-
-env.Library(
-    target='paths',
-    source=[
-        'paths.cpp',
-    ],
-    LIBDEPS=[
-        '$BUILD_DIR/mongo/base',
-    ],
-)
-
-env.Library(
-    target='mmap_v1_options',
-    source=[
-        'mmap_v1_options.cpp',
-    ],
-)
-
-env.Library(
-    target='storage_mmapv1',
-    source=[
-        "aligned_builder.cpp",
-        "catalog/hashtab.cpp",
-        "catalog/index_details.cpp",
-        "catalog/namespace_details.cpp",
-        "catalog/namespace_details_collection_entry.cpp",
-        "catalog/namespace_details_rsv1_metadata.cpp",
-        "catalog/namespace_index.cpp",
-        "commit_notifier.cpp",
-        "data_file.cpp",
-        "data_file_sync.cpp",
-        "durable_mapped_file.cpp",
-        "dur.cpp",
-        "durop.cpp",
-        "dur_preplogbuffer.cpp",
-        "dur_commitjob.cpp",
-        "dur_recover.cpp",
-        "dur_journal.cpp",
-        "dur_journal_writer.cpp",
-        "dur_recovery_unit.cpp",
-        "journal_latency_test_cmd.cpp",
-        "mmap_v1_database_catalog_entry.cpp",
-        "mmap_v1_engine.cpp",
-        "mmap_v1_extent_manager.cpp",
-        "mmap_v1_init.cpp" if mmapv1 else "mmap_v1_noinit.cpp",
-        "repair_database.cpp",
-    ],
-    LIBDEPS=[
-        'record_store_v1',
-        'record_access_tracker',
-        'repair_database_interface',
-        'btree',
-        'file_allocator',
-        'logfile',
-        'compress',
-        'paths',
-        'mmap_v1_options',
-        '$BUILD_DIR/mongo/db/catalog/collection_options',
-        '$BUILD_DIR/mongo/db/catalog/database',
-        '$BUILD_DIR/mongo/db/catalog/database_holder',
-        '$BUILD_DIR/mongo/db/catalog/index_catalog',
-        '$BUILD_DIR/mongo/db/catalog/index_create',
-        '$BUILD_DIR/mongo/db/commands',
-        '$BUILD_DIR/mongo/db/concurrency/lock_manager',
-        '$BUILD_DIR/mongo/db/index_names',
-        '$BUILD_DIR/mongo/db/index/index_descriptor',
-        '$BUILD_DIR/mongo/db/storage/journal_listener',
-        '$BUILD_DIR/mongo/db/storage/kv/kv_prefix',
-        '$BUILD_DIR/mongo/db/storage/storage_engine_lock_file',
-        '$BUILD_DIR/mongo/db/storage/storage_engine_metadata',
-        '$BUILD_DIR/mongo/db/index/index_access_methods',
-        '$BUILD_DIR/mongo/db/write_ops',
-    ],
-    LIBDEPS_PRIVATE=[
-        '$BUILD_DIR/mongo/db/commands/server_status',
-        '$BUILD_DIR/mongo/db/commands/test_commands_enabled',
- '$BUILD_DIR/mongo/db/storage/storage_engine_common', - ], -) - -env.Library( - target = 'repair_database_interface', - source = [ - "repair_database_interface.cpp", - ], - LIBDEPS = [ - ], -) - -compressEnv = env.Clone() -compressEnv.InjectThirdPartyIncludePaths(libraries=['snappy']) -compressEnv -compressEnv.Library( - target='compress', - source=[ - 'compress.cpp', - ], - LIBDEPS=[ - 'paths', - '$BUILD_DIR/third_party/shim_snappy', - ], -) - -env.Library( - target= 'extent', - source= [ - 'extent.cpp', - 'extent_manager.cpp', - ], - LIBDEPS= [ - '$BUILD_DIR/mongo/base', - ] - ) - -env.Library( - target='file_allocator', - source=[ - 'file_allocator.cpp', - ], - LIBDEPS=[ - 'paths', - '$BUILD_DIR/mongo/util/fail_point', - '$BUILD_DIR/mongo/util/processinfo', - ], -) - -env.Library( - target='logfile', - source=[ - 'logfile.cpp', - ], - LIBDEPS=[ - 'mmap', - 'paths', - ], -) - -env.Library( - target='mmap', - source=[ - 'mmap.cpp', - 'mmap_${TARGET_OS_FAMILY}.cpp', - ], - LIBDEPS=[ - '$BUILD_DIR/mongo/db/concurrency/lock_manager', - '$BUILD_DIR/mongo/db/service_context', - '$BUILD_DIR/mongo/db/storage/storage_options', - '$BUILD_DIR/mongo/util/progress_meter', - 'file_allocator', - ], -) - -env.Library( - target= 'record_store_v1', - source= [ - 'record_store_v1_base.cpp', - 'record_store_v1_capped.cpp', - 'record_store_v1_capped_iterator.cpp', - 'record_store_v1_repair_iterator.cpp', - 'record_store_v1_simple.cpp', - 'record_store_v1_simple_iterator.cpp', - 'touch_pages.cpp', - ], - LIBDEPS= [ - '$BUILD_DIR/mongo/db/commands/server_status_core', - '$BUILD_DIR/mongo/db/curop', - '$BUILD_DIR/mongo/db/service_context', - '$BUILD_DIR/mongo/db/storage/storage_options', - '$BUILD_DIR/mongo/util/concurrency/spin_lock', - '$BUILD_DIR/mongo/util/progress_meter', - 'extent', - ] - ) - -env.Library( - target='record_store_v1_test_help', - source=['record_store_v1_test_help.cpp', - ], - LIBDEPS=[ - '$BUILD_DIR/mongo/unittest/unittest', - 'record_store_v1' - ] - ) - -env.Library( - target='record_access_tracker', - source=['record_access_tracker.cpp', - ], - LIBDEPS=[ - '$BUILD_DIR/mongo/base', - '$BUILD_DIR/mongo/util/net/network', - '$BUILD_DIR/mongo/util/processinfo', - ] - ) - -env.Library( - target= 'btree', - source= [ - 'btree/btree_logic.cpp', - 'btree/btree_interface.cpp', - 'btree/btree_ondisk.cpp', - 'btree/key.cpp' - ], - LIBDEPS= [ - '$BUILD_DIR/mongo/base', - '$BUILD_DIR/mongo/db/service_context', - 'record_store_v1', - ] - ) - -if mmapv1: - env.CppUnitTest( - target='storage_engine_mmap_v1_init_test', - source=['mmap_v1_init_test.cpp', - ], - LIBDEPS=[ - '$BUILD_DIR/mongo/db/auth/authmocks', - '$BUILD_DIR/mongo/db/serveronly', - '$BUILD_DIR/mongo/db/service_context', - '$BUILD_DIR/mongo/db/service_context_d', - '$BUILD_DIR/mongo/db/storage/storage_engine_metadata', - '$BUILD_DIR/mongo/db/storage/storage_options', - ], - ) - - env.CppUnitTest(target = 'record_access_tracker_test', - source = ['record_access_tracker_test.cpp'], - LIBDEPS = ['record_access_tracker', - '$BUILD_DIR/mongo/util/clock_source_mock', - '$BUILD_DIR/mongo/util/processinfo', - '$BUILD_DIR/mongo/util/net/network']) - - env.CppUnitTest(target = 'namespace_test', - source = ['catalog/namespace_test.cpp'], - LIBDEPS = ['$BUILD_DIR/mongo/base']) - - env.CppUnitTest( - target='record_store_v1_simple_test', - source=['record_store_v1_simple_test.cpp', - ], - LIBDEPS=[ - 'record_store_v1_test_help' - ] - ) - - env.CppUnitTest( - target='record_store_v1_capped_test', - source=['record_store_v1_capped_test.cpp', - ], - 
LIBDEPS=[ - 'record_store_v1_test_help' - ] - ) - - - env.CppUnitTest( - target='record_store_v1_test', - source=['mmap_v1_record_store_test.cpp', - ], - LIBDEPS=[ - 'record_store_v1_test_help', - '$BUILD_DIR/mongo/db/storage/record_store_test_harness' - ] - ) - - env.Library( - target= 'btree_test_help', - source= [ - 'btree/btree_test_help.cpp', - 'heap_record_store_btree.cpp' - ], - LIBDEPS= [ - 'btree', - 'record_store_v1_test_help', - ] - ) - - env.CppUnitTest( - target='btree_logic_test', - source=['btree/btree_logic_test.cpp' - ], - LIBDEPS=[ - 'btree_test_help' - ] - ) - - env.CppUnitTest( - target='btree_interface_test', - source=['btree/btree_interface_test.cpp' - ], - LIBDEPS=[ - 'btree_test_help', - '$BUILD_DIR/mongo/db/storage/sorted_data_interface_test_harness' - ] - ) - - env.CppUnitTest( - target='data_file_version_test', - source=[ - 'data_file_version_test.cpp', - ], - LIBDEPS=[ - ], - ) diff --git a/src/mongo/db/storage/mmap_v1/aligned_builder.cpp b/src/mongo/db/storage/mmap_v1/aligned_builder.cpp deleted file mode 100644 index 96e7ddd936e..00000000000 --- a/src/mongo/db/storage/mmap_v1/aligned_builder.cpp +++ /dev/null @@ -1,175 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/aligned_builder.h" - -#include "mongo/base/static_assert.h" -#include "mongo/util/debug_util.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::endl; - -AlignedBuilder::AlignedBuilder(unsigned initSize) { - _len = 0; - _malloc(initSize); - uassert(13584, "out of memory AlignedBuilder", _p._allocationAddress); -} - -MONGO_STATIC_ASSERT(sizeof(void*) == sizeof(size_t)); - -/** reset for a re-use. 
shrinks if > 128MB */ -void AlignedBuilder::reset() { - _len = 0; - RARELY { - const unsigned sizeCap = 128 * 1024 * 1024; - if (_p._size > sizeCap) - _realloc(sizeCap, _len); - } -} - -/** reset with a hint as to the upcoming needed size specified */ -void AlignedBuilder::reset(unsigned sz) { - _len = 0; - unsigned Q = 32 * 1024 * 1024 - 1; - unsigned want = (sz + Q) & (~Q); - if (_p._size == want) { - return; - } - if (_p._size > want) { - if (_p._size <= 64 * 1024 * 1024) - return; - bool downsize = false; - RARELY { - downsize = true; - } - if (!downsize) - return; - } - _realloc(want, _len); -} - -void AlignedBuilder::mallocSelfAligned(unsigned sz) { - verify(sz == _p._size); - void* p = malloc(sz + Alignment - 1); - _p._allocationAddress = p; - size_t s = (size_t)p; - size_t sold = s; - s += Alignment - 1; - s = (s / Alignment) * Alignment; - verify(s >= sold); // beginning - verify((s + sz) <= (sold + sz + Alignment - 1)); // end - _p._data = (char*)s; -} - -/* "slow"/infrequent portion of 'grow()' */ -void NOINLINE_DECL AlignedBuilder::growReallocate(unsigned oldLen) { - const unsigned MB = 1024 * 1024; - const unsigned kMaxSize = (sizeof(int*) == 4) ? 512 * MB : 2000 * MB; - const unsigned kWarnSize = (sizeof(int*) == 4) ? 256 * MB : 512 * MB; - - const unsigned oldSize = _p._size; - - // Warn for unexpectedly large buffer - if (_len > kWarnSize) { - warning() << "large amount of uncommitted data (" << _len << " bytes)"; - } - - // Check validity of requested size - invariant(_len > oldSize); - if (_len > kMaxSize) { - error() << "error writing journal: too much uncommitted data (" << _len << " bytes)"; - error() << "shutting down immediately to avoid corruption"; - fassert(28614, _len <= kMaxSize); - } - - // Use smaller maximum for debug builds, as we should never be close the the maximum - dassert(_len <= 1000 * MB); - - // Compute newSize by doubling the existing maximum size until the maximum is reached - invariant(oldSize > 0); - uint64_t newSize = oldSize; // use 64 bits to defend against accidental overflow - while (newSize < _len) { - newSize *= 2; - } - - if (newSize > kMaxSize) { - newSize = kMaxSize; - } - - _realloc(newSize, oldLen); -} - -void AlignedBuilder::_malloc(unsigned sz) { - _p._size = sz; -#if defined(_WIN32) - void* p = VirtualAlloc(0, sz, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); - _p._allocationAddress = p; - _p._data = (char*)p; -#elif defined(__linux__) - // in theory #ifdef _POSIX_VERSION should work, but it doesn't on OS X 10.4, and needs to be - // tested on solaris. so for now, linux only for this. - void* p = 0; - int res = posix_memalign(&p, Alignment, sz); - massert(13524, "out of memory AlignedBuilder", res == 0); - _p._allocationAddress = p; - _p._data = (char*)p; -#else - mallocSelfAligned(sz); - verify(((size_t)_p._data) % Alignment == 0); -#endif -} - -void AlignedBuilder::_realloc(unsigned newSize, unsigned oldLen) { - // posix_memalign alignment is not maintained on reallocs, so we can't use realloc(). 
- AllocationInfo old = _p; - _malloc(newSize); - verify(oldLen <= _len); - memcpy(_p._data, old._data, oldLen); - _free(old._allocationAddress); -} - -void AlignedBuilder::_free(void* p) { -#if defined(_WIN32) - VirtualFree(p, 0, MEM_RELEASE); -#else - free(p); -#endif -} - -void AlignedBuilder::kill() { - _free(_p._allocationAddress); - _p._allocationAddress = 0; - _p._data = 0; -} -} diff --git a/src/mongo/db/storage/mmap_v1/aligned_builder.h b/src/mongo/db/storage/mmap_v1/aligned_builder.h deleted file mode 100644 index f43cbee7d5d..00000000000 --- a/src/mongo/db/storage/mmap_v1/aligned_builder.h +++ /dev/null @@ -1,149 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/string_data.h" -#include "mongo/bson/util/builder.h" - -namespace mongo { - -/** a page-aligned BufBuilder. */ -class AlignedBuilder { -public: - AlignedBuilder(unsigned init_size); - ~AlignedBuilder() { - kill(); - } - - /** reset with a hint as to the upcoming needed size specified */ - void reset(unsigned sz); - - /** reset for a re-use. shrinks if > 128MB */ - void reset(); - - /** note this may be deallocated (realloced) if you keep writing or reset(). 
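Callers should therefore re-fetch the pointer from buf() after every append or reset() instead of caching it across writes.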
*/ - const char* buf() const { - return _p._data; - } - - /** leave room for some stuff later - @return offset in the buffer that was our current position - */ - size_t skip(unsigned n) { - unsigned l = len(); - grow(n); - return l; - } - - /** if buffer grows pointer no longer valid */ - char* atOfs(unsigned ofs) { - return _p._data + ofs; - } - - /** if buffer grows pointer no longer valid */ - char* cur() { - return _p._data + _len; - } - - void appendChar(char j) { - *((char*)grow(sizeof(char))) = j; - } - void appendNum(char j) { - *((char*)grow(sizeof(char))) = j; - } - void appendNum(short j) { - *((short*)grow(sizeof(short))) = j; - } - void appendNum(int j) { - *((int*)grow(sizeof(int))) = j; - } - void appendNum(unsigned j) { - *((unsigned*)grow(sizeof(unsigned))) = j; - } - void appendNum(bool j) { - *((bool*)grow(sizeof(bool))) = j; - } - void appendNum(double j) { - *((double*)grow(sizeof(double))) = j; - } - void appendNum(long long j) { - *((long long*)grow(sizeof(long long))) = j; - } - void appendNum(unsigned long long j) { - *((unsigned long long*)grow(sizeof(unsigned long long))) = j; - } - - void appendBuf(const void* src, size_t len) { - memcpy(grow((unsigned)len), src, len); - } - - template <class T> - void appendStruct(const T& s) { - appendBuf(&s, sizeof(T)); - } - - void appendStr(StringData str, bool includeEOO = true) { - const unsigned len = str.size() + (includeEOO ? 1 : 0); - verify(len < (unsigned)BSONObjMaxUserSize); - str.copyTo(grow(len), includeEOO); - } - - /** @return the in-use length */ - unsigned len() const { - return _len; - } - -private: - static const unsigned Alignment = 8192; - - /** returns the pre-grow write position */ - inline char* grow(unsigned by) { - unsigned oldlen = _len; - _len += by; - if (MONGO_unlikely(_len > _p._size)) { - growReallocate(oldlen); - } - return _p._data + oldlen; - } - - void growReallocate(unsigned oldLenInUse); - void kill(); - void mallocSelfAligned(unsigned sz); - void _malloc(unsigned sz); - void _realloc(unsigned newSize, unsigned oldLenInUse); - void _free(void*); - - struct AllocationInfo { - char* _data; - void* _allocationAddress; - unsigned _size; - } _p; - unsigned _len; // bytes in use -}; -} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp deleted file mode 100644 index 14a3e57503b..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_interface.cpp +++ /dev/null @@ -1,437 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. 
You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include <string> - -#include "mongo/db/storage/mmap_v1/btree/btree_interface.h" - -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" -#include "mongo/db/storage/sorted_data_interface.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/assert_util.h" - -namespace mongo { -namespace { - -using std::unique_ptr; -using std::string; -using std::vector; - -using IndexVersion = IndexDescriptor::IndexVersion; - -template <class OnDiskFormat> -class BtreeBuilderInterfaceImpl final : public SortedDataBuilderInterface { -public: - BtreeBuilderInterfaceImpl(OperationContext* trans, - typename BtreeLogic<OnDiskFormat>::Builder* builder) - : _builder(builder), _trans(trans) {} - - Status addKey(const BSONObj& key, const RecordId& loc) { - return _builder->addKey(key, DiskLoc::fromRecordId(loc)); - } - -private: - std::unique_ptr<typename BtreeLogic<OnDiskFormat>::Builder> _builder; - - // Not owned here. - OperationContext* _trans; -}; - -template <class OnDiskFormat> -class BtreeInterfaceImpl final : public SortedDataInterface { -public: - BtreeInterfaceImpl(HeadManager* headManager, - RecordStore* recordStore, - SavedCursorRegistry* cursorRegistry, - const Ordering& ordering, - const string& indexName, - bool isUnique) { - _btree.reset(new BtreeLogic<OnDiskFormat>( - headManager, recordStore, cursorRegistry, ordering, indexName, isUnique)); - } - - virtual ~BtreeInterfaceImpl() {} - - virtual SortedDataBuilderInterface* getBulkBuilder(OperationContext* opCtx, bool dupsAllowed) { - return new BtreeBuilderInterfaceImpl<OnDiskFormat>(opCtx, - _btree->newBuilder(opCtx, dupsAllowed)); - } - - virtual Status insert(OperationContext* opCtx, - const BSONObj& key, - const RecordId& loc, - bool dupsAllowed) { - return _btree->insert(opCtx, key, DiskLoc::fromRecordId(loc), dupsAllowed); - } - - virtual void unindex(OperationContext* opCtx, - const BSONObj& key, - const RecordId& loc, - bool dupsAllowed) { - _btree->unindex(opCtx, key, DiskLoc::fromRecordId(loc)); - } - - virtual void fullValidate(OperationContext* opCtx, - long long* numKeysOut, - ValidateResults* fullResults) const { - *numKeysOut = _btree->fullValidate(opCtx, NULL, false, false, 0); - } - - virtual bool appendCustomStats(OperationContext* opCtx, - BSONObjBuilder* output, - double scale) const { - return false; - } - - virtual long long getSpaceUsedBytes(OperationContext* opCtx) const { - return _btree->getRecordStore()->dataSize(opCtx); - } - - virtual Status dupKeyCheck(OperationContext* opCtx, const BSONObj& key, const RecordId& loc) { - return _btree->dupKeyCheck(opCtx, key, DiskLoc::fromRecordId(loc)); - } - - virtual bool isEmpty(OperationContext* opCtx) { - return _btree->isEmpty(opCtx); - } - - virtual Status touch(OperationContext* opCtx) const { - return _btree->touch(opCtx); - } - - class Cursor final : public SortedDataInterface::Cursor { - public: - Cursor(OperationContext* opCtx, const 
BtreeLogic<OnDiskFormat>* btree, bool forward) - : _opCtx(opCtx), _btree(btree), _direction(forward ? 1 : -1), _ofs(0) {} - - boost::optional<IndexKeyEntry> next(RequestedInfo parts) override { - if (isEOF()) - return {}; - if (_lastMoveWasRestore) { - // Return current position rather than advancing. - _lastMoveWasRestore = false; - } else { - _btree->advance(_opCtx, &_bucket, &_ofs, _direction); - } - - if (atEndPoint()) - markEOF(); - return curr(parts); - } - - void setEndPosition(const BSONObj& key, bool inclusive) override { - if (key.isEmpty()) { - // This means scan to end of index. - _endState = boost::none; - return; - } - - _endState = {{key, inclusive}}; - seekEndCursor(); // Completes initialization of _endState. - } - - boost::optional<IndexKeyEntry> seek(const BSONObj& key, - bool inclusive, - RequestedInfo parts) override { - locate(key, inclusive == forward() ? RecordId::min() : RecordId::max()); - _lastMoveWasRestore = false; - - if (isEOF()) - return {}; - dassert(inclusive ? compareKeys(getKey(), key) >= 0 : compareKeys(getKey(), key) > 0); - return curr(parts); - } - - - boost::optional<IndexKeyEntry> seek(const IndexSeekPoint& seekPoint, - RequestedInfo parts) override { - bool canUseAdvanceTo = false; - if (!isEOF()) { - int cmp = _btree->customBSONCmp(getKey(), seekPoint, _direction); - - // advanceTo requires that we are positioned "earlier" in the index than the - // seek point, in scan order. - canUseAdvanceTo = forward() ? cmp < 0 : cmp > 0; - } - - - if (canUseAdvanceTo) { - // This takes advantage of current location. - _btree->advanceTo(_opCtx, &_bucket, &_ofs, seekPoint, _direction); - } else { - // Start at root. - _bucket = _btree->getHead(_opCtx); - _ofs = 0; - _btree->customLocate(_opCtx, &_bucket, &_ofs, seekPoint, _direction); - } - - _lastMoveWasRestore = false; - - if (atOrPastEndPointAfterSeeking()) - markEOF(); - return curr(parts); - } - - void save() override { - if (!_lastMoveWasRestore) - _savedEOF = isEOF(); - - if (!isEOF()) { - _saved.bucket = _bucket; - _btree->savedCursors()->registerCursor(&_saved); - // Don't want to change saved position if we only moved during restore. - if (!_lastMoveWasRestore) { - _saved.key = getKey().getOwned(); - _saved.loc = getDiskLoc(); - } - } - // Doing nothing with end cursor since it will do full reseek on restore. - } - - void saveUnpositioned() override { - // Don't leak our registration if save() was previously called. - if (!_saved.bucket.isNull()) - _btree->savedCursors()->unregisterCursor(&_saved); - - _saved.bucket = DiskLoc(); - _savedEOF = true; - } - - void restore() override { - // Always do a full seek on restore. We cannot use our last position since index - // entries may have been inserted closer to our endpoint and we would need to move - // over them. - seekEndCursor(); - - if (_savedEOF) { - markEOF(); - return; - } - - if (_btree->savedCursors()->unregisterCursor(&_saved)) { - // We can use the fast restore mechanism. - _btree->restorePosition( - _opCtx, _saved.key, _saved.loc, _direction, &_bucket, &_ofs); - } else { - // Need to find our position from the root. - locate(_saved.key, _saved.loc.toRecordId()); - } - - _lastMoveWasRestore = isEOF() // We weren't EOF but now are. 
- || (!_btree->isUnique() && getDiskLoc() != _saved.loc) || - compareKeys(getKey(), _saved.key) != 0; - } - - void detachFromOperationContext() final { - _opCtx = nullptr; - } - - void reattachToOperationContext(OperationContext* opCtx) final { - _opCtx = opCtx; - } - - private: - bool isEOF() const { - return _bucket.isNull(); - } - void markEOF() { - _bucket = DiskLoc(); - } - - boost::optional<IndexKeyEntry> curr(RequestedInfo parts) { - if (isEOF()) - return {}; - return {{(parts & kWantKey) ? getKey() : BSONObj(), - (parts & kWantLoc) ? getDiskLoc().toRecordId() : RecordId()}}; - } - - bool atEndPoint() const { - return _endState && _bucket == _endState->bucket && (isEOF() || _ofs == _endState->ofs); - } - - bool atOrPastEndPointAfterSeeking() const { - if (!_endState) - return false; - if (isEOF()) - return true; - - int cmp = compareKeys(getKey(), _endState->key); - return _endState->inclusive ? cmp > 0 : cmp >= 0; - } - - void locate(const BSONObj& key, const RecordId& loc) { - _btree->locate(_opCtx, key, DiskLoc::fromRecordId(loc), _direction, &_ofs, &_bucket); - if (atOrPastEndPointAfterSeeking()) - markEOF(); - } - - // Returns comparison relative to direction of scan. If rhs would be seen later, returns - // a positive value. - int compareKeys(const BSONObj& lhs, const BSONObj& rhs) const { - int cmp = lhs.woCompare(rhs, _btree->ordering(), /*considerFieldName*/ false); - return forward() ? cmp : -cmp; - } - - BSONObj getKey() const { - return _btree->getKey(_opCtx, _bucket, _ofs); - } - DiskLoc getDiskLoc() const { - return _btree->getDiskLoc(_opCtx, _bucket, _ofs); - } - - void seekEndCursor() { - if (!_endState) - return; - _btree->locate(_opCtx, - _endState->key, - forward() == _endState->inclusive ? DiskLoc::max() : DiskLoc::min(), - _direction, - &_endState->ofs, - &_endState->bucket); // pure out params. - } - - bool forward() const { - return _direction == 1; - } - - OperationContext* _opCtx; // not owned - const BtreeLogic<OnDiskFormat>* const _btree; - const int _direction; - - DiskLoc _bucket; - int _ofs; - - struct EndState { - BSONObj key; - bool inclusive; - DiskLoc bucket; - int ofs; - }; - boost::optional<EndState> _endState; - - // Used by next to decide to return current position rather than moving. Should be reset - // to false by any operation that moves the cursor, other than subsequent save/restore - // pairs. - bool _lastMoveWasRestore = false; - - // Only used by save/restore() if _bucket is non-Null. - bool _savedEOF = false; - SavedCursorRegistry::SavedCursor _saved; - }; - - virtual std::unique_ptr<SortedDataInterface::Cursor> newCursor(OperationContext* opCtx, - bool isForward = true) const { - return stdx::make_unique<Cursor>(opCtx, _btree.get(), isForward); - } - - class RandomCursor final : public SortedDataInterface::Cursor { - public: - RandomCursor(OperationContext* opCtx, const BtreeLogic<OnDiskFormat>* btree) - : _opCtx(opCtx), _btree(btree) {} - - boost::optional<IndexKeyEntry> next(RequestedInfo parts) override { - if (_btree->isEmpty(_opCtx)) { - return {}; - } - return _btree->getRandomEntry(_opCtx); - } - - void detachFromOperationContext() final { - _opCtx = nullptr; - } - - void reattachToOperationContext(OperationContext* opCtx) final { - _opCtx = opCtx; - } - - // - // Should never be called. 
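    // (A RandomCursor only hands back randomly sampled entries through
    // next(); the positioning APIs below are never invoked on it and fail
    // with MONGO_UNREACHABLE if they are.)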
- // - void setEndPosition(const BSONObj& key, bool inclusive) override { - MONGO_UNREACHABLE; - } - boost::optional<IndexKeyEntry> seek(const BSONObj& key, - bool inclusive, - RequestedInfo parts) override { - MONGO_UNREACHABLE; - } - boost::optional<IndexKeyEntry> seek(const IndexSeekPoint& seekPoint, - RequestedInfo parts) override { - MONGO_UNREACHABLE; - } - - // - // May be called, but are no-ops. - // - void save() override {} - void saveUnpositioned() override {} - void restore() override {} - - private: - OperationContext* _opCtx; - const BtreeLogic<OnDiskFormat>* const _btree; - }; - - virtual std::unique_ptr<SortedDataInterface::Cursor> newRandomCursor( - OperationContext* opCtx) const { - return stdx::make_unique<RandomCursor>(opCtx, _btree.get()); - } - - virtual Status initAsEmpty(OperationContext* opCtx) { - return _btree->initAsEmpty(opCtx); - } - -private: - unique_ptr<BtreeLogic<OnDiskFormat>> _btree; -}; -} // namespace - -SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, - RecordStore* recordStore, - SavedCursorRegistry* cursorRegistry, - const Ordering& ordering, - const string& indexName, - IndexVersion version, - bool isUnique) { - switch (version) { - case IndexVersion::kV0: - return new BtreeInterfaceImpl<BtreeLayoutV0>( - headManager, recordStore, cursorRegistry, ordering, indexName, isUnique); - case IndexVersion::kV1: - case IndexVersion::kV2: - return new BtreeInterfaceImpl<BtreeLayoutV1>( - headManager, recordStore, cursorRegistry, ordering, indexName, isUnique); - } - MONGO_UNREACHABLE; -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h b/src/mongo/db/storage/mmap_v1/btree/btree_interface.h deleted file mode 100644 index ca61f2cbd28..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_interface.h +++ /dev/null @@ -1,52 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include <string> - -#include "mongo/bson/ordering.h" -#include "mongo/db/catalog/head_manager.h" -#include "mongo/db/index/index_descriptor.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/record_store.h" -#include "mongo/db/storage/sorted_data_interface.h" - -#pragma once - -namespace mongo { -class SavedCursorRegistry; - -SortedDataInterface* getMMAPV1Interface(HeadManager* headManager, - RecordStore* recordStore, - SavedCursorRegistry* cursorRegistry, - const Ordering& ordering, - const std::string& indexName, - IndexDescriptor::IndexVersion version, - bool isUnique); -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp deleted file mode 100644 index b49fd70ec26..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_interface_test.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/storage/mmap_v1/btree/btree_interface.h" - -#include "mongo/base/init.h" -#include "mongo/db/index/index_descriptor.h" -#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" -#include "mongo/db/storage/sorted_data_interface_test_harness.h" -#include "mongo/stdx/memory.h" -#include "mongo/unittest/unittest.h" - -namespace mongo { -namespace { - -using std::unique_ptr; - -class MyHarnessHelper final : public SortedDataInterfaceHarnessHelper { -public: - MyHarnessHelper() : _recordStore("a.b"), _order(Ordering::make(BSONObj())) {} - - std::unique_ptr<SortedDataInterface> newSortedDataInterface(bool unique) final { - std::unique_ptr<SortedDataInterface> sorted( - getMMAPV1Interface(&_headManager, - &_recordStore, - &_cursorRegistry, - _order, - "a_1", // indexName - IndexDescriptor::IndexVersion::kV1, - unique)); - OperationContextNoop op; - massertStatusOK(sorted->initAsEmpty(&op)); - return sorted; - } - - std::unique_ptr<RecoveryUnit> newRecoveryUnit() final { - return stdx::make_unique<HeapRecordStoreBtreeRecoveryUnit>(); - } - -private: - TestHeadManager _headManager; - HeapRecordStoreBtree _recordStore; - SavedCursorRegistry _cursorRegistry; - Ordering _order; -}; - -std::unique_ptr<HarnessHelper> makeHarnessHelper() { - return stdx::make_unique<MyHarnessHelper>(); -} - -MONGO_INITIALIZER(RegisterHarnessFactory)(InitializerContext* const) { - mongo::registerHarnessHelperFactory(makeHarnessHelper); - return Status::OK(); -} -} // namespace -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp deleted file mode 100644 index dc0e4aa83e2..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.cpp +++ /dev/null @@ -1,2440 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kIndex - -#include "mongo/platform/basic.h" - -#include <mutex> -#include <numeric> - -#include "mongo/db/client.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" -#include "mongo/db/storage/mmap_v1/btree/key.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" -#include "mongo/db/storage/record_store.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" - -namespace mongo { - -using std::unique_ptr; -using std::dec; -using std::endl; -using std::hex; -using std::make_pair; -using std::pair; -using std::string; -using std::stringstream; -using std::vector; - -// BtreeLogic::Builder algorithm -// -// Phase 1: -// Handled by caller. Extracts keys from raw documents and puts them in external sorter -// -// Phase 2 (the addKeys phase): -// Add all keys to buckets. When a bucket gets full, pop the highest key (setting the -// nextChild pointer of the bucket to the prevChild of the popped key), add the popped key to -// a parent bucket, and create a new right sibling bucket to add the new key to. If the parent -// bucket is full, this same operation is performed on the parent and all full ancestors. If -// we get to the root and it is full, a new root is created above the current root. When -// creating a new right sibling, it is set as its parent's nextChild as all keys in the right -// sibling will be higher than all keys currently in the parent. - -namespace { -std::once_flag assertValidFlag; -} // namespace - -// -// Public Builder logic -// - -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::Builder* BtreeLogic<BtreeLayout>::newBuilder( - OperationContext* opCtx, bool dupsAllowed) { - return new Builder(this, opCtx, dupsAllowed); -} - -template <class BtreeLayout> -BtreeLogic<BtreeLayout>::Builder::Builder(BtreeLogic* logic, - OperationContext* opCtx, - bool dupsAllowed) - : _logic(logic), _dupsAllowed(dupsAllowed), _opCtx(opCtx) { - // The normal bulk building path calls initAsEmpty, so we already have an empty root bucket. - // This isn't the case in some unit tests that use the Builder directly rather than going - // through an IndexAccessMethod. 
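The addKeys phase described above only ever touches the rightmost bucket on each level of the tree. A toy illustration of that right-edge append, using hypothetical in-memory nodes rather than the original DiskLoc/BucketType machinery (promotion into a parent that is itself full, which the real Builder handles by recursing, is noted but elided):

    #include <memory>
    #include <vector>

    // Toy node for illustration only: sorted keys plus child pointers.
    struct Node {
        std::vector<int> keys;
        std::vector<std::unique_ptr<Node>> children;  // empty for leaves
        Node* parent = nullptr;
        bool full() const { return keys.size() >= 4; }  // toy fanout
    };

    // Append 'key' (>= every key added so far) to the rightmost leaf. On
    // overflow, promote the leaf's highest key into the parent and open a
    // fresh right sibling; returns the leaf that now holds 'key'.
    Node* addKeyRightEdge(std::unique_ptr<Node>& root, Node* rightLeaf, int key) {
        if (!rightLeaf->full()) {
            rightLeaf->keys.push_back(key);
            return rightLeaf;
        }
        Node* parent = rightLeaf->parent;
        if (parent == nullptr) {  // splitting the root: grow the tree upward
            auto newRoot = std::make_unique<Node>();
            newRoot->children.push_back(std::move(root));
            newRoot->children.back()->parent = newRoot.get();
            parent = newRoot.get();
            root = std::move(newRoot);
        }
        // Promote the highest key; the real builder recurses when 'parent'
        // is itself full, up to and including creating a new root.
        parent->keys.push_back(rightLeaf->keys.back());
        rightLeaf->keys.pop_back();
        auto sibling = std::make_unique<Node>();
        sibling->parent = parent;
        sibling->keys.push_back(key);
        parent->children.push_back(std::move(sibling));
        return parent->children.back().get();
    }

Because the input keys arrive pre-sorted, only the right edge of the tree is ever modified, which is what makes this bulk path cheaper than repeated ordinary inserts.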
- _rightLeafLoc = DiskLoc::fromRecordId(_logic->_headManager->getHead(opCtx)); - if (_rightLeafLoc.isNull()) { - _rightLeafLoc = _logic->_addBucket(opCtx); - _logic->_headManager->setHead(_opCtx, _rightLeafLoc.toRecordId()); - } - - // must be empty when starting - invariant(_getBucket(_rightLeafLoc)->n == 0); -} - -template <class BtreeLayout> -class BtreeLogic<BtreeLayout>::Builder::SetRightLeafLocChange : public RecoveryUnit::Change { -public: - SetRightLeafLocChange(Builder* builder, DiskLoc oldLoc) : _builder(builder), _oldLoc(oldLoc) {} - - virtual void commit(boost::optional<Timestamp>) {} - virtual void rollback() { - _builder->_rightLeafLoc = _oldLoc; - } - - Builder* _builder; - const DiskLoc _oldLoc; -}; - -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::Builder::addKey(const BSONObj& keyObj, const DiskLoc& loc) { - unique_ptr<KeyDataOwnedType> key(new KeyDataOwnedType(keyObj)); - - if (key->dataSize() > BtreeLayout::KeyMax) { - string msg = str::stream() << "Btree::insert: key too large to index, failing " - << _logic->_indexName << ' ' << key->dataSize() << ' ' - << key->toString(); - log() << msg << endl; - return Status(ErrorCodes::KeyTooLong, msg); - } - - // If we have a previous key to compare to... - if (_keyLast.get()) { - int cmp = _keyLast->woCompare(*key, _logic->_ordering); - - // This shouldn't happen ever. We expect keys in sorted order. - if (cmp > 0) { - return Status(ErrorCodes::InternalError, "Bad key order in btree builder"); - } - - // This could easily happen.. - if (!_dupsAllowed && (cmp == 0)) { - return Status(ErrorCodes::DuplicateKey, _logic->dupKeyError(*_keyLast)); - } - } - - BucketType* rightLeaf = _getModifiableBucket(_rightLeafLoc); - if (!_logic->pushBack(rightLeaf, loc, *key, DiskLoc())) { - // bucket was full, so split and try with the new node. - _opCtx->recoveryUnit()->registerChange(new SetRightLeafLocChange(this, _rightLeafLoc)); - _rightLeafLoc = newBucket(rightLeaf, _rightLeafLoc); - rightLeaf = _getModifiableBucket(_rightLeafLoc); - invariant(_logic->pushBack(rightLeaf, loc, *key, DiskLoc())); - } - - _keyLast = std::move(key); - return Status::OK(); -} - -// -// Private Builder logic -// - -template <class BtreeLayout> -DiskLoc BtreeLogic<BtreeLayout>::Builder::newBucket(BucketType* leftSib, DiskLoc leftSibLoc) { - invariant(leftSib->n >= 2); // Guaranteed by sufficiently small KeyMax. - - if (leftSib->parent.isNull()) { - // Making a new root - invariant(leftSibLoc.toRecordId() == _logic->_headManager->getHead(_opCtx)); - const DiskLoc newRootLoc = _logic->_addBucket(_opCtx); - leftSib->parent = newRootLoc; - _logic->_headManager->setHead(_opCtx, newRootLoc.toRecordId()); - - // Set the newRoot's nextChild to point to leftSib for the invariant below. - BucketType* newRoot = _getBucket(newRootLoc); - *_opCtx->recoveryUnit()->writing(&newRoot->nextChild) = leftSibLoc; - } - - DiskLoc parentLoc = leftSib->parent; - BucketType* parent = _getModifiableBucket(parentLoc); - - // For the pushBack below to be correct, leftSib must be the right-most child of parent. - invariant(parent->nextChild == leftSibLoc); - - // Pull right-most key out of leftSib and move to parent, splitting parent if necessary. - // Note that popBack() handles setting leftSib's nextChild to the former prevChildNode of - // the popped key. - KeyDataType key; - DiskLoc val; - _logic->popBack(leftSib, &val, &key); - if (!_logic->pushBack(parent, val, key, leftSibLoc)) { - // parent is full, so split it. 
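        // (This call recurses: splitting a full parent may split the
        // grandparent in turn, and newBucket() creates a fresh root once the
        // recursion reaches the top of the tree.)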
- parentLoc = newBucket(parent, parentLoc); - parent = _getModifiableBucket(parentLoc); - invariant(_logic->pushBack(parent, val, key, leftSibLoc)); - leftSib->parent = parentLoc; - } - - // Create a new bucket to the right of leftSib and set its parent pointer and the downward - // nextChild pointer from the parent. - DiskLoc newBucketLoc = _logic->_addBucket(_opCtx); - BucketType* newBucket = _getBucket(newBucketLoc); - *_opCtx->recoveryUnit()->writing(&newBucket->parent) = parentLoc; - *_opCtx->recoveryUnit()->writing(&parent->nextChild) = newBucketLoc; - return newBucketLoc; -} - -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::BucketType* -BtreeLogic<BtreeLayout>::Builder::_getModifiableBucket(DiskLoc loc) { - return _logic->btreemod(_opCtx, _logic->getBucket(_opCtx, loc)); -} - -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::Builder::_getBucket( - DiskLoc loc) { - return _logic->getBucket(_opCtx, loc); -} - -// -// BtreeLogic logic -// - -// static -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::FullKey BtreeLogic<BtreeLayout>::getFullKey( - const BucketType* bucket, int i) { - if (i >= bucket->n) { - int code = 13000; - massert(code, - (string) "invalid keyNode: " + BSON("i" << i << "n" << bucket->n).jsonString(), - i < bucket->n); - } - return FullKey(bucket, i); -} - -// static -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::KeyHeaderType& BtreeLogic<BtreeLayout>::getKeyHeader( - BucketType* bucket, int i) { - return ((KeyHeaderType*)bucket->data)[i]; -} - -// static -template <class BtreeLayout> -const typename BtreeLogic<BtreeLayout>::KeyHeaderType& BtreeLogic<BtreeLayout>::getKeyHeader( - const BucketType* bucket, int i) { - return ((const KeyHeaderType*)bucket->data)[i]; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::markUnused(BucketType* bucket, int keyPos) { - invariant(keyPos >= 0 && keyPos < bucket->n); - getKeyHeader(bucket, keyPos).setUnused(); -} - -template <class BtreeLayout> -char* BtreeLogic<BtreeLayout>::dataAt(BucketType* bucket, short ofs) { - return bucket->data + ofs; -} - -template <class BtreeLayout> -typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::btreemod( - OperationContext* opCtx, BucketType* bucket) { - opCtx->recoveryUnit()->writingPtr(bucket, BtreeLayout::BucketSize); - return bucket; -} - -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::totalDataSize(BucketType* bucket) { - return (int)(BtreeLayout::BucketSize - (bucket->data - (char*)bucket)); -} - -// We define this value as the maximum number of bytes such that, if we have -// fewer than this many bytes, we must be able to either merge with or receive -// keys from any neighboring node. If our utilization goes below this value we -// know we can bring up the utilization with a simple operation. Ignoring the -// 90/10 split policy which is sometimes employed and our 'unused' nodes, this -// is a lower bound on bucket utilization for non root buckets. -// -// Note that the exact value here depends on the implementation of -// _rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as -// follows: We know we cannot merge with the neighbor, so the total data size -// for us, the neighbor, and the separator must be at least -// BucketType::bodySize() + 1. We must be able to accept one key of any -// allowed size, so our size plus storage for that additional key must be -// <= BucketType::bodySize() / 2. 
This way, with the extra key we'll have a -// new bucket data size < half the total data size and by the implementation -// of _rebalancedSeparatorPos() the key must be added. -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::lowWaterMark() { - return BtreeLayout::BucketBodySize / 2 - BtreeLayout::KeyMax - sizeof(KeyHeaderType) + 1; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::init(BucketType* bucket) { - BtreeLayout::initBucket(bucket); - bucket->parent.Null(); - bucket->nextChild.Null(); - bucket->flags = Packed; - bucket->n = 0; - bucket->emptySize = totalDataSize(bucket); - bucket->topSize = 0; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::_unalloc(BucketType* bucket, int bytes) { - bucket->topSize -= bytes; - bucket->emptySize += bytes; -} - -/** - * We allocate space from the end of the buffer for data. The keynodes grow from the front. - */ -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::_alloc(BucketType* bucket, int bytes) { - invariant(bucket->emptySize >= bytes); - bucket->topSize += bytes; - bucket->emptySize -= bytes; - int ofs = totalDataSize(bucket) - bucket->topSize; - invariant(ofs > 0); - return ofs; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::setNotPacked(BucketType* bucket) { - bucket->flags &= ~Packed; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::setPacked(BucketType* bucket) { - bucket->flags |= Packed; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::_delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty) { - invariant(keypos >= 0 && keypos <= bucket->n); - invariant(childLocForPos(bucket, keypos).isNull()); - invariant((mayEmpty && bucket->n > 0) || bucket->n > 1 || bucket->nextChild.isNull()); - - bucket->emptySize += sizeof(KeyHeaderType); - bucket->n--; - - for (int j = keypos; j < bucket->n; j++) { - getKeyHeader(bucket, j) = getKeyHeader(bucket, j + 1); - } - - setNotPacked(bucket); -} - -/** - * Pull rightmost key from the bucket and set its prevChild pointer to be the nextChild for the - * whole bucket. It is assumed that caller already has the old value of the nextChild - * pointer and is about to add a pointer to it elsewhere in the tree. - * - * This is only used by BtreeLogic::Builder. Think very hard (and change this comment) before - * using it anywhere else. - * - * WARNING: The keyDataOut that is filled out by this function points to newly unalloced memory - * inside of this bucket. It only remains valid until the next write to this bucket. - */ -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::popBack(BucketType* bucket, - DiskLoc* recordLocOut, - KeyDataType* keyDataOut) { - massert(17435, "n==0 in btree popBack()", bucket->n > 0); - - invariant(getKeyHeader(bucket, bucket->n - 1).isUsed()); - - FullKey kn = getFullKey(bucket, bucket->n - 1); - *recordLocOut = kn.recordLoc; - keyDataOut->assign(kn.data); - int keysize = kn.data.dataSize(); - - // The left/prev child of the node we are popping now goes in to the nextChild slot as all - // of its keys are greater than all remaining keys in this node. - bucket->nextChild = kn.prevChildBucket; - bucket->n--; - - // This is risky because the keyDataOut we filled out above will now point to this newly - // unalloced memory. - bucket->emptySize += sizeof(KeyHeaderType); - _unalloc(bucket, keysize); -} - -/** - * Add a key. Must be > all existing. Be careful to set next ptr right. 
- */ -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::pushBack(BucketType* bucket, - const DiskLoc recordLoc, - const KeyDataType& key, - const DiskLoc prevChild) { - int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); - if (bytesNeeded > bucket->emptySize) { - return false; - } - invariant(bytesNeeded <= bucket->emptySize); - - if (bucket->n) { - const FullKey klast = getFullKey(bucket, bucket->n - 1); - if (klast.data.woCompare(key, _ordering) > 0) { - log() << "btree bucket corrupt? " - "consider reindexing or running validate command" - << endl; - log() << " klast: " << klast.data.toString() << endl; - log() << " key: " << key.toString() << endl; - MONGO_UNREACHABLE; - } - } - - bucket->emptySize -= sizeof(KeyHeaderType); - KeyHeaderType& kn = getKeyHeader(bucket, bucket->n++); - kn.prevChildBucket = prevChild; - kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize())); - short ofs = kn.keyDataOfs(); - char* p = dataAt(bucket, ofs); - memcpy(p, key.data(), key.dataSize()); - return true; -} - -/** - * Durability note: - * - * We do separate intent declarations herein. Arguably one could just declare the whole bucket - * given we do group commits. This is something we could investigate later as to what is - * faster. - **/ - -/** - * Insert a key in a bucket with no complexity -- no splits required - * Returns false if a split is required. - */ -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::basicInsert(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int& keypos, - const KeyDataType& key, - const DiskLoc recordLoc) { - invariant(bucket->n < 1024); - invariant(keypos >= 0 && keypos <= bucket->n); - - int bytesNeeded = key.dataSize() + sizeof(KeyHeaderType); - if (bytesNeeded > bucket->emptySize) { - _pack(opCtx, bucket, bucketLoc, keypos); - if (bytesNeeded > bucket->emptySize) { - return false; - } - } - - invariant(getBucket(opCtx, bucketLoc) == bucket); - - { - // declare that we will write to [k(keypos),k(n)] - char* start = reinterpret_cast<char*>(&getKeyHeader(bucket, keypos)); - char* end = reinterpret_cast<char*>(&getKeyHeader(bucket, bucket->n + 1)); - - // Declare that we will write to [k(keypos),k(n)] - opCtx->recoveryUnit()->writingPtr(start, end - start); - } - - // e.g. for n==3, keypos==2 - // 1 4 9 -> 1 4 _ 9 - for (int j = bucket->n; j > keypos; j--) { - getKeyHeader(bucket, j) = getKeyHeader(bucket, j - 1); - } - - size_t writeLen = sizeof(bucket->emptySize) + sizeof(bucket->topSize) + sizeof(bucket->n); - opCtx->recoveryUnit()->writingPtr(&bucket->emptySize, writeLen); - bucket->emptySize -= sizeof(KeyHeaderType); - bucket->n++; - - // This _KeyNode was marked for writing above. - KeyHeaderType& kn = getKeyHeader(bucket, keypos); - kn.prevChildBucket.Null(); - kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short)_alloc(bucket, key.dataSize())); - char* p = dataAt(bucket, kn.keyDataOfs()); - opCtx->recoveryUnit()->writingPtr(p, key.dataSize()); - memcpy(p, key.data(), key.dataSize()); - return true; -} - -/** - * With this implementation, refPos == 0 disregards effect of refPos. index > 0 prevents - * creation of an empty bucket. 
- */ -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::mayDropKey(BucketType* bucket, int index, int refPos) { - return index > 0 && (index != refPos) && getKeyHeader(bucket, index).isUnused() && - getKeyHeader(bucket, index).prevChildBucket.isNull(); -} - -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::_packedDataSize(BucketType* bucket, int refPos) { - if (bucket->flags & Packed) { - return BtreeLayout::BucketSize - bucket->emptySize - BucketType::HeaderSize; - } - - int size = 0; - for (int j = 0; j < bucket->n; ++j) { - if (mayDropKey(bucket, j, refPos)) { - continue; - } - size += getFullKey(bucket, j).data.dataSize() + sizeof(KeyHeaderType); - } - - return size; -} - -/** - * When we delete things, we just leave empty space until the node is full and then we repack - * it. - */ -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::_pack(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc thisLoc, - int& refPos) { - invariant(getBucket(opCtx, thisLoc) == bucket); - - if (bucket->flags & Packed) { - return; - } - - _packReadyForMod(btreemod(opCtx, bucket), refPos); -} - -/** - * Version when write intent already declared. - */ -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::_packReadyForMod(BucketType* bucket, int& refPos) { - if (bucket->flags & Packed) { - return; - } - - int tdz = totalDataSize(bucket); - char temp[BtreeLayout::BucketSize]; - int ofs = tdz; - bucket->topSize = 0; - - int i = 0; - for (int j = 0; j < bucket->n; j++) { - if (mayDropKey(bucket, j, refPos)) { - // key is unused and has no children - drop it - continue; - } - - if (i != j) { - if (refPos == j) { - // i < j so j will never be refPos again - refPos = i; - } - getKeyHeader(bucket, i) = getKeyHeader(bucket, j); - } - - short ofsold = getKeyHeader(bucket, i).keyDataOfs(); - int sz = getFullKey(bucket, i).data.dataSize(); - ofs -= sz; - bucket->topSize += sz; - memcpy(temp + ofs, dataAt(bucket, ofsold), sz); - getKeyHeader(bucket, i).setKeyDataOfsSavingUse(ofs); - ++i; - } - - if (refPos == bucket->n) { - refPos = i; - } - - bucket->n = i; - int dataUsed = tdz - ofs; - memcpy(bucket->data + ofs, temp + ofs, dataUsed); - - bucket->emptySize = tdz - dataUsed - bucket->n * sizeof(KeyHeaderType); - int foo = bucket->emptySize; - invariant(foo >= 0); - setPacked(bucket); - assertValid(_indexName, bucket, _ordering); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::truncateTo(BucketType* bucket, int N, int& refPos) { - bucket->n = N; - setNotPacked(bucket); - _packReadyForMod(bucket, refPos); -} - -/** - * In the standard btree algorithm, we would split based on the - * existing keys _and_ the new key. But that's more work to - * implement, so we split the existing keys and then add the new key. - * - * There are several published heuristic algorithms for doing splits, but basically what you - * want are (1) even balancing between the two sides and (2) a small split key so the parent can - * have a larger branching factor. - * - * We just have a simple algorithm right now: if a key includes the halfway point (or 10% way - * point) in terms of bytes, split on that key; otherwise split on the key immediately to the - * left of the halfway point (or 10% point). - * - * This function is expected to be called on a packed bucket. 
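In other words, the chosen index always leaves at least one key on each side (the safeguards below clamp it to [1, n-2]), so a split can never produce an empty bucket.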
- */ -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::splitPos(BucketType* bucket, int keypos) { - invariant(bucket->n > 2); - int split = 0; - int rightSize = 0; - - // When splitting a btree node, if the new key is greater than all the other keys, we should - // not do an even split, but a 90/10 split. see SERVER-983. TODO I think we only want to - // do the 90% split on the rhs node of the tree. - int rightSizeLimit = - (bucket->topSize + sizeof(KeyHeaderType) * bucket->n) / (keypos == bucket->n ? 10 : 2); - - for (int i = bucket->n - 1; i > -1; --i) { - rightSize += getFullKey(bucket, i).data.dataSize() + sizeof(KeyHeaderType); - if (rightSize > rightSizeLimit) { - split = i; - break; - } - } - - // safeguards - we must not create an empty bucket - if (split < 1) { - split = 1; - } else if (split > bucket->n - 2) { - split = bucket->n - 2; - } - - return split; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::reserveKeysFront(BucketType* bucket, int nAdd) { - invariant(bucket->emptySize >= int(sizeof(KeyHeaderType) * nAdd)); - bucket->emptySize -= sizeof(KeyHeaderType) * nAdd; - for (int i = bucket->n - 1; i > -1; --i) { - getKeyHeader(bucket, i + nAdd) = getKeyHeader(bucket, i); - } - bucket->n += nAdd; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::setKey(BucketType* bucket, - int i, - const DiskLoc recordLoc, - const KeyDataType& key, - const DiskLoc prevChildBucket) { - KeyHeaderType& kn = getKeyHeader(bucket, i); - kn.recordLoc = recordLoc; - kn.prevChildBucket = prevChildBucket; - short ofs = (short)_alloc(bucket, key.dataSize()); - kn.setKeyDataOfs(ofs); - char* p = dataAt(bucket, ofs); - memcpy(p, key.data(), key.dataSize()); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::dropFront(BucketType* bucket, int nDrop, int& refpos) { - for (int i = nDrop; i < bucket->n; ++i) { - getKeyHeader(bucket, i - nDrop) = getKeyHeader(bucket, i); - } - bucket->n -= nDrop; - setNotPacked(bucket); - _packReadyForMod(bucket, refpos); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::customLocate(OperationContext* opCtx, - DiskLoc* locInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const { - pair<DiskLoc, int> unused; - - customLocate(opCtx, locInOut, keyOfsInOut, seekPoint, direction, unused); - skipUnusedKeys(opCtx, locInOut, keyOfsInOut, direction); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::advance(OperationContext* opCtx, - DiskLoc* bucketLocInOut, - int* posInOut, - int direction) const { - *bucketLocInOut = advance(opCtx, *bucketLocInOut, posInOut, direction); - skipUnusedKeys(opCtx, bucketLocInOut, posInOut, direction); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::skipUnusedKeys(OperationContext* opCtx, - DiskLoc* loc, - int* pos, - int direction) const { - while (!loc->isNull() && !keyIsUsed(opCtx, *loc, *pos)) { - *loc = advance(opCtx, *loc, pos, direction); - } -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::advanceTo(OperationContext* opCtx, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const { - advanceToImpl(opCtx, thisLocInOut, keyOfsInOut, seekPoint, direction); - skipUnusedKeys(opCtx, thisLocInOut, keyOfsInOut, direction); -} - -/** - * find smallest/biggest value greater-equal/less-equal than specified - * - * starting thisLoc + keyOfs will be strictly less than/strictly greater than - * keyBegin/keyBeginLen/keyEnd - * - * All the direction checks below 
allowed me to refactor the code, but possibly separate forward - * and reverse implementations would be more efficient - */ -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::advanceToImpl(OperationContext* opCtx, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const { - BucketType* bucket = getBucket(opCtx, *thisLocInOut); - - int l, h; - bool dontGoUp; - - if (direction > 0) { - l = *keyOfsInOut; - h = bucket->n - 1; - int cmpResult = customBSONCmp(getFullKey(bucket, h).data.toBson(), seekPoint, direction); - dontGoUp = (cmpResult >= 0); - } else { - l = 0; - h = *keyOfsInOut; - int cmpResult = customBSONCmp(getFullKey(bucket, l).data.toBson(), seekPoint, direction); - dontGoUp = (cmpResult <= 0); - } - - pair<DiskLoc, int> bestParent; - - if (dontGoUp) { - // this comparison result assures h > l - if (!customFind(opCtx, l, h, seekPoint, direction, thisLocInOut, keyOfsInOut, bestParent)) { - return; - } - } else { - // go up parents until rightmost/leftmost node is >=/<= target or at top - while (!bucket->parent.isNull()) { - *thisLocInOut = bucket->parent; - bucket = getBucket(opCtx, *thisLocInOut); - - if (direction > 0) { - if (customBSONCmp(getFullKey(bucket, bucket->n - 1).data.toBson(), - seekPoint, - direction) >= 0) { - break; - } - } else { - if (customBSONCmp(getFullKey(bucket, 0).data.toBson(), seekPoint, direction) <= 0) { - break; - } - } - } - } - - customLocate(opCtx, thisLocInOut, keyOfsInOut, seekPoint, direction, bestParent); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::customLocate(OperationContext* opCtx, - DiskLoc* locInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction, - pair<DiskLoc, int>& bestParent) const { - BucketType* bucket = getBucket(opCtx, *locInOut); - - if (0 == bucket->n) { - *locInOut = DiskLoc(); - return; - } - - // go down until find smallest/biggest >=/<= target - for (;;) { - int l = 0; - int h = bucket->n - 1; - - // +direction: 0, -direction: h - int z = (direction > 0) ? 
0 : h; - - // leftmost/rightmost key may possibly be >=/<= search key - int res = customBSONCmp(getFullKey(bucket, z).data.toBson(), seekPoint, direction); - if (direction * res >= 0) { - DiskLoc next; - *keyOfsInOut = z; - - if (direction > 0) { - dassert(z == 0); - next = getKeyHeader(bucket, 0).prevChildBucket; - } else { - next = bucket->nextChild; - } - - if (!next.isNull()) { - bestParent = pair<DiskLoc, int>(*locInOut, *keyOfsInOut); - *locInOut = next; - bucket = getBucket(opCtx, *locInOut); - continue; - } else { - return; - } - } - - res = customBSONCmp(getFullKey(bucket, h - z).data.toBson(), seekPoint, direction); - if (direction * res < 0) { - DiskLoc next; - if (direction > 0) { - next = bucket->nextChild; - } else { - next = getKeyHeader(bucket, 0).prevChildBucket; - } - - if (next.isNull()) { - // if bestParent is null, we've hit the end and locInOut gets set to DiskLoc() - *locInOut = bestParent.first; - *keyOfsInOut = bestParent.second; - return; - } else { - *locInOut = next; - bucket = getBucket(opCtx, *locInOut); - continue; - } - } - - if (!customFind(opCtx, l, h, seekPoint, direction, locInOut, keyOfsInOut, bestParent)) { - return; - } - - bucket = getBucket(opCtx, *locInOut); - } -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::customFind(OperationContext* opCtx, - int low, - int high, - const IndexSeekPoint& seekPoint, - int direction, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - pair<DiskLoc, int>& bestParent) const { - const BucketType* bucket = getBucket(opCtx, *thisLocInOut); - - for (;;) { - if (low + 1 == high) { - *keyOfsInOut = (direction > 0) ? high : low; - DiskLoc next = getKeyHeader(bucket, high).prevChildBucket; - if (!next.isNull()) { - bestParent = make_pair(*thisLocInOut, *keyOfsInOut); - *thisLocInOut = next; - return true; - } else { - return false; - } - } - - int middle = low + (high - low) / 2; - - int cmp = customBSONCmp(getFullKey(bucket, middle).data.toBson(), seekPoint, direction); - if (cmp < 0) { - low = middle; - } else if (cmp > 0) { - high = middle; - } else { - if (direction < 0) { - low = middle; - } else { - high = middle; - } - } - } -} - -/** - * NOTE: Currently the Ordering implementation assumes a compound index will not have more keys - * than an unsigned variable has bits. The same assumption is used in the implementation below - * with respect to the 'mask' variable. 
- * - * 'l' is a regular bsonobj - * - * 'rBegin' is composed partly of an existing bsonobj, and the remaining keys are taken from a - * vector of elements that frequently changes - * - * see https://jira.mongodb.org/browse/SERVER-371 - */ -// static -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::customBSONCmp(const BSONObj& left, - const IndexSeekPoint& right, - int direction) const { - // XXX: make this readable - dassert(right.keySuffix.size() == right.suffixInclusive.size()); - - BSONObjIterator ll(left); - BSONObjIterator rr(right.keyPrefix); - unsigned mask = 1; - size_t i = 0; - for (; i < size_t(right.prefixLen); ++i, mask <<= 1) { - BSONElement lll = ll.next(); - BSONElement rrr = rr.next(); - - int x = lll.woCompare(rrr, false); - if (_ordering.descending(mask)) - x = -x; - if (x != 0) - return x; - } - if (right.prefixExclusive) { - return -direction; - } - for (; i < right.keySuffix.size(); ++i, mask <<= 1) { - if (!ll.more()) - return -direction; - - BSONElement lll = ll.next(); - BSONElement rrr = *right.keySuffix[i]; - int x = lll.woCompare(rrr, false); - if (_ordering.descending(mask)) - x = -x; - if (x != 0) - return x; - if (!right.suffixInclusive[i]) { - return -direction; - } - } - return ll.more() ? direction : 0; -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::exists(OperationContext* opCtx, const KeyDataType& key) const { - int position = 0; - - // Find the DiskLoc - bool found; - - DiskLoc bucket = _locate(opCtx, getRootLoc(opCtx), key, &position, &found, DiskLoc::min(), 1); - - while (!bucket.isNull()) { - FullKey fullKey = getFullKey(getBucket(opCtx, bucket), position); - if (fullKey.header.isUsed()) { - return fullKey.data.woEqual(key); - } - bucket = advance(opCtx, bucket, &position, 1); - } - - return false; -} - -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::dupKeyCheck(OperationContext* opCtx, - const BSONObj& key, - const DiskLoc& loc) const { - KeyDataOwnedType theKey(key); - if (!wouldCreateDup(opCtx, theKey, loc)) { - return Status::OK(); - } - - return Status(ErrorCodes::DuplicateKey, dupKeyError(theKey)); -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::wouldCreateDup(OperationContext* opCtx, - const KeyDataType& key, - const DiskLoc self) const { - int position; - bool found; - - DiskLoc posLoc = _locate(opCtx, getRootLoc(opCtx), key, &position, &found, DiskLoc::min(), 1); - - while (!posLoc.isNull()) { - FullKey fullKey = getFullKey(getBucket(opCtx, posLoc), position); - if (fullKey.header.isUsed()) { - // TODO: we may not need fullKey.data until we know fullKey.header.isUsed() here - // and elsewhere. - if (fullKey.data.woEqual(key)) { - return fullKey.recordLoc != self; - } - break; - } - - posLoc = advance(opCtx, posLoc, &position, 1); - } - return false; -} - -template <class BtreeLayout> -string BtreeLogic<BtreeLayout>::dupKeyError(const KeyDataType& key) const { - stringstream ss; - ss << "E11000 duplicate key error "; - ss << "index: " << _indexName << " "; - ss << "dup key: " << key.toString(); - return ss.str(); -} - -/** - * Find a key within this btree bucket. - * - * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the - * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, our - * performance is still good. - * - * assertIfDup: if the key exists (ignoring the recordLoc), uassert - * - * pos: for existing keys k0...kn-1. - * returns # it goes BEFORE. 
so key[pos-1] < key < key[pos] - * returns n if it goes after the last existing key. - * note result might be an Unused location! - */ -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::_find(OperationContext* opCtx, - BucketType* bucket, - const KeyDataType& key, - const DiskLoc& recordLoc, - bool errorIfDup, - int* keyPositionOut, - bool* foundOut) const { - // XXX: fix the ctor for DiskLoc56bit so we can just convert w/o assignment operator - LocType genericRecordLoc; - genericRecordLoc = recordLoc; - - bool dupsCheckedYet = false; - - int low = 0; - int high = bucket->n - 1; - int middle = (low + high) / 2; - - while (low <= high) { - FullKey fullKey = getFullKey(bucket, middle); - int cmp = key.woCompare(fullKey.data, _ordering); - - // The key data is the same. - if (0 == cmp) { - // Found the key in this bucket. If we're checking for dups... - if (errorIfDup) { - if (fullKey.header.isUnused()) { - // It's ok that the key is there if it is unused. We need to check that - // there aren't other entries for the key then. as it is very rare that - // we get here, we don't put any coding effort in here to make this - // particularly fast - if (!dupsCheckedYet) { - // This is expensive and we only want to do it once(? -- when would - // it happen twice). - dupsCheckedYet = true; - if (exists(opCtx, key)) { - if (wouldCreateDup(opCtx, key, genericRecordLoc)) { - return Status(ErrorCodes::DuplicateKey, dupKeyError(key)); - } else { - return Status(ErrorCodes::DuplicateKeyValue, - "key/value already in index"); - } - } - } - } else { - if (fullKey.recordLoc == recordLoc) { - return Status(ErrorCodes::DuplicateKeyValue, "key/value already in index"); - } else { - return Status(ErrorCodes::DuplicateKey, dupKeyError(key)); - } - } - } - - // If we're here dup keys are allowed, or the key is a dup but unused. - LocType recordLocCopy = fullKey.recordLoc; - - // We clear this bit so we can test equality without the used bit messing us up. - // XXX: document this - // XXX: kill this GETOFS stuff - recordLocCopy.GETOFS() &= ~1; - - // Set 'cmp' to the comparison w/the DiskLoc and fall through below. - cmp = recordLoc.compare(recordLocCopy); - } - - if (cmp < 0) { - high = middle - 1; - } else if (cmp > 0) { - low = middle + 1; - } else { - // Found it! - *keyPositionOut = middle; - *foundOut = true; - return Status::OK(); - } - - middle = (low + high) / 2; - } - - // Not found. - *keyPositionOut = low; - - // Some debugging checks. 
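    // (These verify that 'low' really brackets the key: the key at 'low'
    // must compare >= the search key, and the key at 'low - 1', when
    // present, must compare <= it.)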
- if (low != bucket->n) {
- invariant(key.woCompare(getFullKey(bucket, low).data, _ordering) <= 0);
-
- if (low > 0) {
- if (getFullKey(bucket, low - 1).data.woCompare(key, _ordering) > 0) {
- DEV {
- log() << key.toString() << endl;
- log() << getFullKey(bucket, low - 1).data.toString() << endl;
- }
- MONGO_UNREACHABLE;
- }
- }
- }
-
- *foundOut = false;
- return Status::OK();
-}
-
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::delBucket(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
- invariant(bucketLoc != getRootLoc(opCtx));
-
- _cursorRegistry->invalidateCursorsForBucket(bucketLoc);
-
- BucketType* p = getBucket(opCtx, bucket->parent);
- int parentIdx = indexInParent(opCtx, bucket, bucketLoc);
- *opCtx->recoveryUnit()->writing(&childLocForPos(p, parentIdx)) = DiskLoc();
- deallocBucket(opCtx, bucket, bucketLoc);
-}
-
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::deallocBucket(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc) {
- bucket->n = BtreeLayout::INVALID_N_SENTINEL;
- bucket->parent.Null();
- _recordStore->deleteRecord(opCtx, bucketLoc.toRecordId());
-}
-
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::restorePosition(OperationContext* opCtx,
- const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- int direction,
- DiskLoc* bucketLocInOut,
- int* keyOffsetInOut) const {
- // The caller has to ensure validity of the saved cursor using the SavedCursorRegistry
- BucketType* bucket = getBucket(opCtx, *bucketLocInOut);
- invariant(bucket);
- invariant(BtreeLayout::INVALID_N_SENTINEL != bucket->n);
-
- if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
- skipUnusedKeys(opCtx, bucketLocInOut, keyOffsetInOut, direction);
- return;
- }
-
- if (*keyOffsetInOut > 0) {
- (*keyOffsetInOut)--;
- if (_keyIsAt(savedKey, savedLoc, bucket, *keyOffsetInOut)) {
- skipUnusedKeys(opCtx, bucketLocInOut, keyOffsetInOut, direction);
- return;
- }
- }
-
- locate(opCtx, savedKey, savedLoc, direction, keyOffsetInOut, bucketLocInOut);
-}
-
-template <class BtreeLayout>
-bool BtreeLogic<BtreeLayout>::_keyIsAt(const BSONObj& savedKey,
- const DiskLoc& savedLoc,
- BucketType* bucket,
- int keyPos) const {
- if (keyPos >= bucket->n) {
- return false;
- }
-
- FullKey key = getFullKey(bucket, keyPos);
- if (!key.data.toBson().binaryEqual(savedKey)) {
- return false;
- }
- return key.header.recordLoc == savedLoc;
-}
-
-/**
- * May delete the bucket 'bucket' rendering 'bucketLoc' invalid.
- */
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::delKeyAtPos(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int p) {
- invariant(bucket->n > 0);
- DiskLoc left = childLocForPos(bucket, p);
- if (bucket->n == 1) {
- if (left.isNull() && bucket->nextChild.isNull()) {
- _delKeyAtPos(bucket, p);
- if (isHead(bucket)) {
- // we don't delete the top bucket ever
- } else {
- if (!mayBalanceWithNeighbors(opCtx, bucket, bucketLoc)) {
- // An empty bucket is only allowed as a transient state. If
- // there are no neighbors to balance with, we delete ourselves.
- // This condition is only expected in legacy btrees.
- delBucket(opCtx, bucket, bucketLoc);
- }
- }
- return;
- }
- deleteInternalKey(opCtx, bucket, bucketLoc, p);
- return;
- }
-
- if (left.isNull()) {
- _delKeyAtPos(bucket, p);
- mayBalanceWithNeighbors(opCtx, bucket, bucketLoc);
- } else {
- deleteInternalKey(opCtx, bucket, bucketLoc, p);
- }
-}
-
-/**
- * This function replaces the specified key (k) by either the prev or next key in the btree
- * (k'). We require that k have either a left or right child. If k has a left child, we set k'
- * to the prev key of k, which must be a leaf present in the left child. If k does not have a
- * left child, we set k' to the next key of k, which must be a leaf present in the right child.
- * When we replace k with k', we copy k' over k (which may cause a split) and then remove k'
- * from its original location. Because k' is stored in a descendant of k, replacing k by k'
- * will not modify the storage location of the original k', and we can easily remove k' from its
- * original location.
- *
- * This function is only needed in cases where k has a left or right child; in other cases a
- * simpler key removal implementation is possible.
- *
- * NOTE on noncompliant BtreeBuilder btrees: It is possible (though likely rare) for btrees
- * created by BtreeBuilder to have k' that is not a leaf, see SERVER-2732. These cases are
- * handled in the same manner as described in the "legacy btree structures" note below.
- *
- * NOTE on legacy btree structures: In legacy btrees, k' can be a nonleaf. In such a case we
- * 'delete' k by marking it as an unused node rather than replacing it with k'. Also, k' may be
- * a leaf but marked as an unused node. In such a case we replace k by k', preserving the key's
- * unused marking. This function is only expected to mark a key as unused when handling a
- * legacy btree.
- */
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::deleteInternalKey(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos) {
- DiskLoc lchild = childLocForPos(bucket, keypos);
- DiskLoc rchild = childLocForPos(bucket, keypos + 1);
- invariant(!lchild.isNull() || !rchild.isNull());
- int advanceDirection = lchild.isNull() ? 1 : -1;
- int advanceKeyOfs = keypos;
- DiskLoc advanceLoc = advance(opCtx, bucketLoc, &advanceKeyOfs, advanceDirection);
- // advanceLoc must be a descendant of thisLoc, because thisLoc has a
- // child in the proper direction and all descendants of thisLoc must be
- // nonempty because they are not the root.
- BucketType* advanceBucket = getBucket(opCtx, advanceLoc); - - if (!childLocForPos(advanceBucket, advanceKeyOfs).isNull() || - !childLocForPos(advanceBucket, advanceKeyOfs + 1).isNull()) { - markUnused(bucket, keypos); - return; - } - - FullKey kn = getFullKey(advanceBucket, advanceKeyOfs); - // Because advanceLoc is a descendant of thisLoc, updating thisLoc will - // not affect packing or keys of advanceLoc and kn will be stable - // during the following setInternalKey() - setInternalKey(opCtx, - bucket, - bucketLoc, - keypos, - kn.recordLoc, - kn.data, - childLocForPos(bucket, keypos), - childLocForPos(bucket, keypos + 1)); - delKeyAtPos(opCtx, btreemod(opCtx, advanceBucket), advanceLoc, advanceKeyOfs); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::replaceWithNextChild(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc) { - invariant(bucket->n == 0 && !bucket->nextChild.isNull()); - if (bucket->parent.isNull()) { - invariant(getRootLoc(opCtx) == bucketLoc); - _headManager->setHead(opCtx, bucket->nextChild.toRecordId()); - } else { - BucketType* parentBucket = getBucket(opCtx, bucket->parent); - int bucketIndexInParent = indexInParent(opCtx, bucket, bucketLoc); - *opCtx->recoveryUnit()->writing(&childLocForPos(parentBucket, bucketIndexInParent)) = - bucket->nextChild; - } - - *opCtx->recoveryUnit()->writing(&getBucket(opCtx, bucket->nextChild)->parent) = bucket->parent; - _cursorRegistry->invalidateCursorsForBucket(bucketLoc); - deallocBucket(opCtx, bucket, bucketLoc); -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::canMergeChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - const int leftIndex) { - invariant(leftIndex >= 0 && leftIndex < bucket->n); - - DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); - DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); - - if (leftNodeLoc.isNull() || rightNodeLoc.isNull()) { - return false; - } - - int pos = 0; - - BucketType* leftBucket = getBucket(opCtx, leftNodeLoc); - BucketType* rightBucket = getBucket(opCtx, rightNodeLoc); - - int sum = BucketType::HeaderSize + _packedDataSize(leftBucket, pos) + - _packedDataSize(rightBucket, pos) + getFullKey(bucket, leftIndex).data.dataSize() + - sizeof(KeyHeaderType); - - return sum <= BtreeLayout::BucketSize; -} - -/** - * This implementation must respect the meaning and value of lowWaterMark. Also see comments in - * splitPos(). - */ -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::_rebalancedSeparatorPos(OperationContext* opCtx, - BucketType* bucket, - int leftIndex) { - int split = -1; - int rightSize = 0; - - const BucketType* l = childForPos(opCtx, bucket, leftIndex); - const BucketType* r = childForPos(opCtx, bucket, leftIndex + 1); - - int KNS = sizeof(KeyHeaderType); - int rightSizeLimit = (l->topSize + l->n * KNS + getFullKey(bucket, leftIndex).data.dataSize() + - KNS + r->topSize + r->n * KNS) / - 2; - - // This constraint should be ensured by only calling this function - // if we go below the low water mark. 
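- //
- // As a concrete illustration of the formula above (with arbitrarily chosen numbers): if
- // the left child packs 1000 bytes of key data in 20 keys, the right child packs 400
- // bytes in 8 keys, and the separator key occupies 50 bytes, then rightSizeLimit is
- // (1000 + 20 * KNS + 50 + KNS + 400 + 8 * KNS) / 2, i.e. half the combined packed size,
- // and the scans below pick the first split point whose right half exceeds that limit.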
- invariant(rightSizeLimit < BtreeLayout::BucketBodySize); - - for (int i = r->n - 1; i > -1; --i) { - rightSize += getFullKey(r, i).data.dataSize() + KNS; - if (rightSize > rightSizeLimit) { - split = l->n + 1 + i; - break; - } - } - - if (split == -1) { - rightSize += getFullKey(bucket, leftIndex).data.dataSize() + KNS; - if (rightSize > rightSizeLimit) { - split = l->n; - } - } - - if (split == -1) { - for (int i = l->n - 1; i > -1; --i) { - rightSize += getFullKey(l, i).data.dataSize() + KNS; - if (rightSize > rightSizeLimit) { - split = i; - break; - } - } - } - - // safeguards - we must not create an empty bucket - if (split < 1) { - split = 1; - } else if (split > l->n + 1 + r->n - 2) { - split = l->n + 1 + r->n - 2; - } - - return split; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::doMergeChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex) { - DiskLoc leftNodeLoc = childLocForPos(bucket, leftIndex); - DiskLoc rightNodeLoc = childLocForPos(bucket, leftIndex + 1); - - BucketType* l = btreemod(opCtx, getBucket(opCtx, leftNodeLoc)); - BucketType* r = btreemod(opCtx, getBucket(opCtx, rightNodeLoc)); - - int pos = 0; - _packReadyForMod(l, pos); - _packReadyForMod(r, pos); - - // We know the additional keys below will fit in l because canMergeChildren() must be true. - int oldLNum = l->n; - // left child's right child becomes old parent key's left child - FullKey knLeft = getFullKey(bucket, leftIndex); - invariant(pushBack(l, knLeft.recordLoc, knLeft.data, l->nextChild)); - - for (int i = 0; i < r->n; ++i) { - FullKey kn = getFullKey(r, i); - invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket)); - } - - l->nextChild = r->nextChild; - fixParentPtrs(opCtx, l, leftNodeLoc, oldLNum); - delBucket(opCtx, r, rightNodeLoc); - - childLocForPos(bucket, leftIndex + 1) = leftNodeLoc; - childLocForPos(bucket, leftIndex) = DiskLoc(); - _delKeyAtPos(bucket, leftIndex, true); - - if (bucket->n == 0) { - // Will trash bucket and bucketLoc. - // - // TODO To ensure all leaves are of equal height, we should ensure this is only called - // on the root. - replaceWithNextChild(opCtx, bucket, bucketLoc); - } else { - mayBalanceWithNeighbors(opCtx, bucket, bucketLoc); - } -} - -template <class BtreeLayout> -int BtreeLogic<BtreeLayout>::indexInParent(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc) const { - invariant(!bucket->parent.isNull()); - const BucketType* p = getBucket(opCtx, bucket->parent); - if (p->nextChild == bucketLoc) { - return p->n; - } - - for (int i = 0; i < p->n; ++i) { - if (getKeyHeader(p, i).prevChildBucket == bucketLoc) { - return i; - } - } - - log() << "ERROR: can't find ref to child bucket.\n"; - log() << "child: " << bucketLoc << "\n"; - // dump(); - log() << "Parent: " << bucket->parent << "\n"; - // p->dump(); - MONGO_UNREACHABLE; -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::tryBalanceChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex) { - // If we can merge, then we must merge rather than balance to preserve bucket utilization - // constraints. 
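- // That is, whenever both children plus the separator key would fit in a single bucket,
- // we return false here so that mayBalanceWithNeighbors() falls through to
- // doMergeChildren() instead of shuffling keys between two under-filled buckets.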
- if (canMergeChildren(opCtx, bucket, bucketLoc, leftIndex)) { - return false; - } - - doBalanceChildren(opCtx, btreemod(opCtx, bucket), bucketLoc, leftIndex); - return true; -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::doBalanceLeftToRight(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex, - int split, - BucketType* l, - const DiskLoc lchild, - BucketType* r, - const DiskLoc rchild) { - // TODO maybe do some audits the same way pushBack() does? As a precondition, rchild + the - // old separator are <= half a body size, and lchild is at most completely full. Based on - // the value of split, rchild will get <= half of the total bytes which is at most 75% of a - // full body. So rchild will have room for the following keys: - int rAdd = l->n - split; - reserveKeysFront(r, rAdd); - - for (int i = split + 1, j = 0; i < l->n; ++i, ++j) { - FullKey kn = getFullKey(l, i); - setKey(r, j, kn.recordLoc, kn.data, kn.prevChildBucket); - } - - FullKey leftIndexKN = getFullKey(bucket, leftIndex); - setKey(r, rAdd - 1, leftIndexKN.recordLoc, leftIndexKN.data, l->nextChild); - - fixParentPtrs(opCtx, r, rchild, 0, rAdd - 1); - - FullKey kn = getFullKey(l, split); - l->nextChild = kn.prevChildBucket; - - // Because lchild is a descendant of thisLoc, updating thisLoc will not affect packing or - // keys of lchild and kn will be stable during the following setInternalKey() - setInternalKey(opCtx, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild); - - // lchild and rchild cannot be merged, so there must be >0 (actually more) keys to the left - // of split. - int zeropos = 0; - truncateTo(l, split, zeropos); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::doBalanceRightToLeft(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex, - int split, - BucketType* l, - const DiskLoc lchild, - BucketType* r, - const DiskLoc rchild) { - // As a precondition, lchild + the old separator are <= half a body size, - // and rchild is at most completely full. Based on the value of split, - // lchild will get less than half of the total bytes which is at most 75% - // of a full body. So lchild will have room for the following keys: - int lN = l->n; - - { - // left child's right child becomes old parent key's left child - FullKey kn = getFullKey(bucket, leftIndex); - invariant(pushBack(l, kn.recordLoc, kn.data, l->nextChild)); - } - - for (int i = 0; i < split - lN - 1; ++i) { - FullKey kn = getFullKey(r, i); - invariant(pushBack(l, kn.recordLoc, kn.data, kn.prevChildBucket)); - } - - { - FullKey kn = getFullKey(r, split - lN - 1); - l->nextChild = kn.prevChildBucket; - // Child lN was lchild's old nextChild, and don't need to fix that one. - fixParentPtrs(opCtx, l, lchild, lN + 1, l->n); - // Because rchild is a descendant of thisLoc, updating thisLoc will - // not affect packing or keys of rchild and kn will be stable - // during the following setInternalKey() - setInternalKey(opCtx, bucket, bucketLoc, leftIndex, kn.recordLoc, kn.data, lchild, rchild); - } - - // lchild and rchild cannot be merged, so there must be >0 (actually more) - // keys to the right of split. 
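- // dropFront() below then discards the (split - lN) entries just copied out of 'r': the
- // keys moved into 'l' plus the one promoted as the new separator.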
- int zeropos = 0; - dropFront(r, split - lN, zeropos); -} - -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::doBalanceChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex) { - DiskLoc lchild = childLocForPos(bucket, leftIndex); - DiskLoc rchild = childLocForPos(bucket, leftIndex + 1); - - int zeropos = 0; - BucketType* l = btreemod(opCtx, getBucket(opCtx, lchild)); - _packReadyForMod(l, zeropos); - - BucketType* r = btreemod(opCtx, getBucket(opCtx, rchild)); - _packReadyForMod(r, zeropos); - - int split = _rebalancedSeparatorPos(opCtx, bucket, leftIndex); - - // By definition, if we are below the low water mark and cannot merge - // then we must actively balance. - invariant(split != l->n); - if (split < l->n) { - doBalanceLeftToRight(opCtx, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild); - } else { - doBalanceRightToLeft(opCtx, bucket, bucketLoc, leftIndex, split, l, lchild, r, rchild); - } -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::mayBalanceWithNeighbors(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc) { - if (bucket->parent.isNull()) { - return false; - } - - if (_packedDataSize(bucket, 0) >= lowWaterMark()) { - return false; - } - - BucketType* p = getBucket(opCtx, bucket->parent); - int parentIdx = indexInParent(opCtx, bucket, bucketLoc); - - // TODO will missing neighbor case be possible long term? Should we try to merge/balance - // somehow in that case if so? - bool mayBalanceRight = (parentIdx < p->n) && !childLocForPos(p, parentIdx + 1).isNull(); - bool mayBalanceLeft = (parentIdx > 0) && !childLocForPos(p, parentIdx - 1).isNull(); - - // Balance if possible on one side - we merge only if absolutely necessary to preserve btree - // bucket utilization constraints since that's a more heavy duty operation (especially if we - // must re-split later). - if (mayBalanceRight && tryBalanceChildren(opCtx, p, bucket->parent, parentIdx)) { - return true; - } - - if (mayBalanceLeft && tryBalanceChildren(opCtx, p, bucket->parent, parentIdx - 1)) { - return true; - } - - BucketType* pm = btreemod(opCtx, getBucket(opCtx, bucket->parent)); - if (mayBalanceRight) { - doMergeChildren(opCtx, pm, bucket->parent, parentIdx); - return true; - } else if (mayBalanceLeft) { - doMergeChildren(opCtx, pm, bucket->parent, parentIdx - 1); - return true; - } - - return false; -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::unindex(OperationContext* opCtx, - const BSONObj& key, - const DiskLoc& recordLoc) { - int pos; - bool found = false; - KeyDataOwnedType ownedKey(key); - - DiskLoc loc = _locate(opCtx, getRootLoc(opCtx), ownedKey, &pos, &found, recordLoc, 1); - if (found) { - BucketType* bucket = btreemod(opCtx, getBucket(opCtx, loc)); - delKeyAtPos(opCtx, bucket, loc, pos); - assertValid(_indexName, getRoot(opCtx), _ordering); - } - return found; -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::isEmpty(OperationContext* opCtx) const { - return getRoot(opCtx)->n == 0; -} - -/** - * This can cause a lot of additional page writes when we assign buckets to different parents. - * Maybe get rid of parent ptrs? 
- */
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::fixParentPtrs(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int firstIndex,
- int lastIndex) {
- invariant(getBucket(opCtx, bucketLoc) == bucket);
-
- if (lastIndex == -1) {
- lastIndex = bucket->n;
- }
-
- for (int i = firstIndex; i <= lastIndex; i++) {
- const DiskLoc childLoc = childLocForPos(bucket, i);
- if (!childLoc.isNull()) {
- *opCtx->recoveryUnit()->writing(&getBucket(opCtx, childLoc)->parent) = bucketLoc;
- }
- }
-}
-
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::setInternalKey(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild) {
- childLocForPos(bucket, keypos).Null();
- // This may leave the bucket empty (n == 0) which is ok only as a transient state. In the
- // instant case, the implementation of insertHere behaves correctly when n == 0 and as a
- // side effect increments n.
- _delKeyAtPos(bucket, keypos, true);
-
- // Ensure we do not orphan neighbor's old child.
- invariant(childLocForPos(bucket, keypos) == rchild);
-
- // Just set temporarily - required to pass validation in insertHere()
- childLocForPos(bucket, keypos) = lchild;
-
- insertHere(opCtx, bucketLoc, keypos, key, recordLoc, lchild, rchild);
-}
-
-/**
- * Insert a key in this bucket, splitting if necessary.
- *
- * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. NOTE
- * this function may free some data, and as a result the value passed for keypos may be invalid
- * after calling insertHere()
- *
- * Some of the write intent signaling below relies on the implementation of the optimized write
- * intent code in basicInsert().
- */
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::insertHere(OperationContext* opCtx,
- const DiskLoc bucketLoc,
- int pos,
- const KeyDataType& key,
- const DiskLoc recordLoc,
- const DiskLoc leftChildLoc,
- const DiskLoc rightChildLoc) {
- BucketType* bucket = getBucket(opCtx, bucketLoc);
-
- if (!basicInsert(opCtx, bucket, bucketLoc, pos, key, recordLoc)) {
- // If basicInsert() fails, the bucket will be packed as required by split().
- split(opCtx,
- btreemod(opCtx, bucket),
- bucketLoc,
- pos,
- recordLoc,
- key,
- leftChildLoc,
- rightChildLoc);
- return;
- }
-
- KeyHeaderType* kn = &getKeyHeader(bucket, pos);
- if (pos + 1 == bucket->n) {
- // It's the last key.
- if (bucket->nextChild != leftChildLoc) {
- // XXX log more
- MONGO_UNREACHABLE;
- }
- kn->prevChildBucket = bucket->nextChild;
- invariant(kn->prevChildBucket == leftChildLoc);
- *opCtx->recoveryUnit()->writing(&bucket->nextChild) = rightChildLoc;
- if (!rightChildLoc.isNull()) {
- *opCtx->recoveryUnit()->writing(&getBucket(opCtx, rightChildLoc)->parent) = bucketLoc;
- }
- } else {
- kn->prevChildBucket = leftChildLoc;
- if (getKeyHeader(bucket, pos + 1).prevChildBucket != leftChildLoc) {
- // XXX: log more
- MONGO_UNREACHABLE;
- }
- const LocType* pc = &getKeyHeader(bucket, pos + 1).prevChildBucket;
- // Intent declared in basicInsert()
- *const_cast<LocType*>(pc) = rightChildLoc;
- if (!rightChildLoc.isNull()) {
- *opCtx->recoveryUnit()->writing(&getBucket(opCtx, rightChildLoc)->parent) = bucketLoc;
- }
- }
-}
-
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::split(OperationContext* opCtx,
- BucketType* bucket,
- const DiskLoc bucketLoc,
- int keypos,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc lchild,
- const DiskLoc rchild) {
- int split = splitPos(bucket, keypos);
- DiskLoc rLoc = _addBucket(opCtx);
- BucketType* r = btreemod(opCtx, getBucket(opCtx, rLoc));
-
- for (int i = split + 1; i < bucket->n; i++) {
- FullKey kn = getFullKey(bucket, i);
- invariant(pushBack(r, kn.recordLoc, kn.data, kn.prevChildBucket));
- }
- r->nextChild = bucket->nextChild;
- assertValid(_indexName, r, _ordering);
-
- r = NULL;
- fixParentPtrs(opCtx, getBucket(opCtx, rLoc), rLoc);
-
- FullKey splitkey = getFullKey(bucket, split);
- // splitkey gets promoted, its children will be thisLoc (l) and rLoc (r)
- bucket->nextChild = splitkey.prevChildBucket;
-
- // Because thisLoc is a descendant of parent, updating parent will not affect packing or
- // keys of thisLoc and splitkey will be stable during the following:
-
- if (bucket->parent.isNull()) {
- // promote splitkey to a parent node; make a new parent if we were the root
- DiskLoc L = _addBucket(opCtx);
- BucketType* p = btreemod(opCtx, getBucket(opCtx, L));
- invariant(pushBack(p, splitkey.recordLoc, splitkey.data, bucketLoc));
- p->nextChild = rLoc;
- assertValid(_indexName, p, _ordering);
- bucket->parent = L;
- _headManager->setHead(opCtx, L.toRecordId());
- *opCtx->recoveryUnit()->writing(&getBucket(opCtx, rLoc)->parent) = bucket->parent;
- } else {
- // set this before calling _insert - if it splits it will do fixParent() logic and
- // change the value.
- *opCtx->recoveryUnit()->writing(&getBucket(opCtx, rLoc)->parent) = bucket->parent;
- _insert(opCtx,
- getBucket(opCtx, bucket->parent),
- bucket->parent,
- splitkey.data,
- splitkey.recordLoc,
- true, // dupsAllowed
- bucketLoc,
- rLoc)
- .transitional_ignore();
- }
-
- int newpos = keypos;
- // Note this may trash splitkey.key, thus we had to promote it before finishing up here.
- truncateTo(bucket, split, newpos);
-
- // add our new key, there is room for it now
- if (keypos <= split) {
- insertHere(opCtx, bucketLoc, newpos, key, recordLoc, lchild, rchild);
- } else {
- int kp = keypos - split - 1;
- invariant(kp >= 0);
- insertHere(opCtx, rLoc, kp, key, recordLoc, lchild, rchild);
- }
-}
-
-class DummyDocWriter final : public DocWriter {
-public:
- DummyDocWriter(size_t sz) : _sz(sz) {}
- virtual void writeDocument(char* buf) const { /* no-op */
- }
- virtual size_t documentSize() const {
- return _sz;
- }
-
-private:
- size_t _sz;
-};
-
-template <class BtreeLayout>
-Status BtreeLogic<BtreeLayout>::initAsEmpty(OperationContext* opCtx) {
- if (!_headManager->getHead(opCtx).isNull()) {
- return Status(ErrorCodes::InternalError, "index already initialized");
- }
-
- _headManager->setHead(opCtx, _addBucket(opCtx).toRecordId());
- return Status::OK();
-}
-
-template <class BtreeLayout>
-DiskLoc BtreeLogic<BtreeLayout>::_addBucket(OperationContext* opCtx) {
- DummyDocWriter docWriter(BtreeLayout::BucketSize);
- StatusWith<RecordId> loc =
- _recordStore->insertRecordWithDocWriter(opCtx, &docWriter, Timestamp());
- // XXX: remove this(?) or turn into massert or sanely bubble it back up.
- uassertStatusOK(loc.getStatus());
-
- // this is a new bucket, not referenced by anyone, probably don't need this lock
- BucketType* b = btreemod(opCtx, getBucket(opCtx, loc.getValue()));
- init(b);
- return DiskLoc::fromRecordId(loc.getValue());
-}
-
-// static
-template <class BtreeLayout>
-void BtreeLogic<BtreeLayout>::dumpBucket(const BucketType* bucket, int indentLength) {
- log() << "BUCKET n:" << bucket->n << ", parent:" << hex << bucket->parent.getOfs() << dec;
-
- const string indent = string(indentLength, ' ');
-
- for (int i = 0; i < bucket->n; i++) {
- FullKey k = getFullKey(bucket, i);
- string ks = k.data.toString();
- log() << indent << " " << hex << k.prevChildBucket.getOfs() << "<-- prevChildBucket for "
- << i;
- log() << indent << " " << i << ' ' << redact(ks.substr(0, 60))
- << " Loc:" << k.recordLoc.toString() << dec;
- if (getKeyHeader(bucket, i).isUnused()) {
- log() << " UNUSED";
- }
- }
-
- log() << indent << " " << hex << bucket->nextChild.getOfs() << dec << "<-- nextChild bucket";
-}
-
-template <class BtreeLayout>
-DiskLoc BtreeLogic<BtreeLayout>::getDiskLoc(OperationContext* opCtx,
- const DiskLoc& bucketLoc,
- const int keyOffset) const {
- invariant(!bucketLoc.isNull());
- BucketType* bucket = getBucket(opCtx, bucketLoc);
- return getKeyHeader(bucket, keyOffset).recordLoc;
-}
-
-template <class BtreeLayout>
-BSONObj BtreeLogic<BtreeLayout>::getKey(OperationContext* opCtx,
- const DiskLoc& bucketLoc,
- const int keyOffset) const {
- invariant(!bucketLoc.isNull());
- BucketType* bucket = getBucket(opCtx, bucketLoc);
- int n = bucket->n;
- invariant(n != BtreeLayout::INVALID_N_SENTINEL);
- invariant(n >= 0);
- invariant(n < 10000);
- invariant(n != 0xffff);
-
- invariant(keyOffset >= 0);
- invariant(keyOffset < n);
-
- // XXX: should we really return an empty obj if keyOffset>=n?
- if (keyOffset >= n) {
- return BSONObj();
- } else {
- return getFullKey(bucket, keyOffset).data.toBson();
- }
-}
-
-template <class BtreeLayout>
-IndexKeyEntry BtreeLogic<BtreeLayout>::getRandomEntry(OperationContext* opCtx) const {
- // To ensure a uniform distribution, all keys must have an equal probability of being selected.
- // Specifically, a key from the root should have the same probability of being selected as a key
- // from a leaf.
- // - // Here we do a random walk until we get to a leaf, storing a random key from each bucket along - // the way down. Because the root is always present in the random walk, but any given leaf would - // seldom be seen, we assign weights to each key such that the key from the leaf is much more - // likely to be selected than the key from the root. These weights attempt to ensure each entry - // is equally likely to be selected and avoid bias towards the entries closer to the root. - // - // As a simplification, we treat all buckets in a given level as having the same number of - // children. While this is inaccurate if the tree isn't perfectly balanced or if key-size - // greatly varies, it is assumed to be good enough for this purpose. - invariant(!isEmpty(opCtx)); - BucketType* root = getRoot(opCtx); - - vector<int64_t> nKeysInLevel; - vector<FullKey> selectedKeys; - - auto& prng = opCtx->getClient()->getPrng(); - - int nRetries = 0; - const int kMaxRetries = 5; - do { - // See documentation below for description of parameters. - recordRandomWalk(opCtx, &prng, root, 1, &nKeysInLevel, &selectedKeys); - } while (selectedKeys.empty() && nRetries++ < kMaxRetries); - massert(28826, - str::stream() << "index " << _indexName << " may be corrupt, please repair", - !selectedKeys.empty()); - - invariant(nKeysInLevel.size() == selectedKeys.size()); - // Select a key from the random walk such that each key from the B-tree has an equal probability - // of being selected. - // - // Let N be the sum of 'nKeysInLevel'. That is, the total number of keys in the B-tree. - // - // On our walk down the tree, we selected exactly one key from each level of the B-tree, where - // 'selectedKeys[i]' came from the ith level of the tree. On any given level, each key has an - // equal probability of being selected. Specifically, a key on level i has a probability of - // 1/'nKeysInLevel[i]' of being selected as 'selectedKeys[i]'. Then if, given our selected keys, - // we choose to return 'selectedKeys[i]' with a probability of 'nKeysInLevel[i]'/N, that key - // will be returned with a probability of 1/'nKeysInLevel[i]' * 'nKeysInLevel[i]'/N = 1/N. - // - // So 'selectedKeys[i]' should have a probability of 'nKeysInLevel[i]'/N of being returned. We - // will do so by picking a random number X in the range [0, N). Then, if X is in the first - // 'nKeysInLevel[0]' numbers, we will return 'selectedKeys[0]'. If X is in the next - // 'nKeysInLevel[1]' numbers, we will return 'selectedKeys[1]', and so on. - int64_t choice = prng.nextInt64(std::accumulate(nKeysInLevel.begin(), nKeysInLevel.end(), 0)); - for (size_t i = 0; i < nKeysInLevel.size(); i++) { - if (choice < nKeysInLevel[i]) { - return {selectedKeys[i].data.toBson(), selectedKeys[i].header.recordLoc.toRecordId()}; - } - choice -= nKeysInLevel[i]; - } - MONGO_UNREACHABLE; -} - -/** - * Does a random walk through the tree, recording information about the walk along the way. - * - * 'nKeysInLevel' will be filled in such that 'nKeysInLevel[i]' is an approximation of the number of - * keys in the ith level of the B-tree. - * - * 'selectedKeys' will be filled in such that 'selectedKeys[i]' will be a pseudo-random key selected - * from the bucket we went through on the ith level of the B-tree. 
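- *
- * As a worked instance of the weighting described in getRandomEntry() (numbers chosen
- * arbitrarily): a two-level walk recording nKeysInLevel = {10, 990} gives N = 1000. The
- * root key is then returned with probability 10/1000 and the leaf key with probability
- * 990/1000, so any individual key, whether picked with probability 1/10 at the root or
- * 1/990 in the leaf, is returned with overall probability 1/1000.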
- */ -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::recordRandomWalk(OperationContext* opCtx, - PseudoRandom* prng, - BucketType* curBucket, - int64_t nBucketsInCurrentLevel, - vector<int64_t>* nKeysInLevel, - vector<FullKey>* selectedKeys) const { - // Select a random key from this bucket, and record it. - int nKeys = curBucket->n; - int keyToReturn = prng->nextInt32(nKeys); - auto fullKey = getFullKey(curBucket, keyToReturn); - // If the key is not used, just skip this level. - if (fullKey.header.isUsed()) { - selectedKeys->push_back(std::move(fullKey)); - nKeysInLevel->push_back(nBucketsInCurrentLevel * nKeys); - } - - // Select a random child and descend (if there are any). - int nChildren = nKeys + 1; - int nextChild = prng->nextInt32(nChildren); - if (auto child = childForPos(opCtx, curBucket, nextChild)) { - recordRandomWalk( - opCtx, prng, child, nBucketsInCurrentLevel * nChildren, nKeysInLevel, selectedKeys); - } -} - -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::touch(OperationContext* opCtx) const { - return _recordStore->touch(opCtx, NULL); -} - -template <class BtreeLayout> -long long BtreeLogic<BtreeLayout>::fullValidate(OperationContext* opCtx, - long long* unusedCount, - bool strict, - bool dumpBuckets, - unsigned depth) const { - return _fullValidate(opCtx, getRootLoc(opCtx), unusedCount, strict, dumpBuckets, depth); -} - -template <class BtreeLayout> -long long BtreeLogic<BtreeLayout>::_fullValidate(OperationContext* opCtx, - const DiskLoc bucketLoc, - long long* unusedCount, - bool strict, - bool dumpBuckets, - unsigned depth) const { - BucketType* bucket = getBucket(opCtx, bucketLoc); - assertValid(_indexName, bucket, _ordering, true); - - if (dumpBuckets) { - log() << bucketLoc.toString() << ' '; - dumpBucket(bucket, depth); - } - - long long keyCount = 0; - - for (int i = 0; i < bucket->n; i++) { - KeyHeaderType& kn = getKeyHeader(bucket, i); - - if (kn.isUsed()) { - keyCount++; - } else if (NULL != unusedCount) { - ++(*unusedCount); - } - - if (!kn.prevChildBucket.isNull()) { - DiskLoc left = kn.prevChildBucket; - BucketType* b = getBucket(opCtx, left); - - if (strict) { - invariant(b->parent == bucketLoc); - } else if (b->parent != bucketLoc) { - warning() << "index corruption detected: b->parent != bucketLoc"; - } - - keyCount += _fullValidate(opCtx, left, unusedCount, strict, dumpBuckets, depth + 1); - } - } - - if (!bucket->nextChild.isNull()) { - BucketType* b = getBucket(opCtx, bucket->nextChild); - if (strict) { - invariant(b->parent == bucketLoc); - } else if (b->parent != bucketLoc) { - warning() << "index corruption detected: b->parent != bucketLoc"; - } - - keyCount += - _fullValidate(opCtx, bucket->nextChild, unusedCount, strict, dumpBuckets, depth + 1); - } - - return keyCount; -} - -// XXX: remove this(?) used to not dump every key in assertValid. -int nDumped = 0; - -// static -template <class BtreeLayout> -void BtreeLogic<BtreeLayout>::assertValid(const std::string& ns, - BucketType* bucket, - const Ordering& ordering, - bool force) { - if (!force) { - return; - } - - // this is very slow so don't do often - { - static int _k; - if (++_k % 128) { - return; - } - } - - DEV { - // slow: - for (int i = 0; i < bucket->n - 1; i++) { - FullKey firstKey = getFullKey(bucket, i); - FullKey secondKey = getFullKey(bucket, i + 1); - int z = firstKey.data.woCompare(secondKey.data, ordering); - if (z > 0) { - log() << "ERROR: btree key order corrupt. 
Keys:" << endl; - if (++nDumped < 5) { - for (int j = 0; j < bucket->n; j++) { - log() << " " << redact(getFullKey(bucket, j).data.toString()) << endl; - } - dumpBucket(bucket); - } - MONGO_UNREACHABLE; - break; - } else if (z == 0) { - if (!(firstKey.header.recordLoc < secondKey.header.recordLoc)) { - log() << "ERROR: btree key order corrupt (recordlocs wrong):" << endl; - log() << " k(" << i << ")" << redact(firstKey.data.toString()) - << " RL:" << firstKey.header.recordLoc.toString() << endl; - log() << " k(" << i + 1 << ")" << redact(secondKey.data.toString()) - << " RL:" << secondKey.header.recordLoc.toString() << endl; - invariant(firstKey.header.recordLoc < secondKey.header.recordLoc); - } - } - } - } - else { - // faster: - if (bucket->n > 1) { - FullKey k1 = getFullKey(bucket, 0); - FullKey k2 = getFullKey(bucket, bucket->n - 1); - int z = k1.data.woCompare(k2.data, ordering); - if (z > 0) { - log() << "Btree keys out of order in collection " << ns; - std::call_once(assertValidFlag, [&bucket]() { dumpBucket(bucket); }); - MONGO_UNREACHABLE; - } - } - } -} - -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::insert(OperationContext* opCtx, - const BSONObj& rawKey, - const DiskLoc& value, - bool dupsAllowed) { - KeyDataOwnedType key(rawKey); - - if (key.dataSize() > BtreeLayout::KeyMax) { - string msg = str::stream() << "Btree::insert: key too large to index, failing " - << _indexName << ' ' << key.dataSize() << ' ' << key.toString(); - return Status(ErrorCodes::KeyTooLong, msg); - } - - Status status = _insert( - opCtx, getRoot(opCtx), getRootLoc(opCtx), key, value, dupsAllowed, DiskLoc(), DiskLoc()); - - assertValid(_indexName, getRoot(opCtx), _ordering); - return status; -} - -template <class BtreeLayout> -Status BtreeLogic<BtreeLayout>::_insert(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - const KeyDataType& key, - const DiskLoc recordLoc, - bool dupsAllowed, - const DiskLoc leftChild, - const DiskLoc rightChild) { - invariant(key.dataSize() > 0); - - int pos; - bool found; - Status findStatus = _find(opCtx, bucket, key, recordLoc, !dupsAllowed, &pos, &found); - if (!findStatus.isOK()) { - return findStatus; - } - - if (found) { - KeyHeaderType& header = getKeyHeader(bucket, pos); - if (header.isUnused()) { - LOG(4) << "btree _insert: reusing unused key" << endl; - massert(17433, "_insert: reuse key but lchild is not null", leftChild.isNull()); - massert(17434, "_insert: reuse key but rchild is not null", rightChild.isNull()); - opCtx->recoveryUnit()->writing(&header)->setUsed(); - return Status::OK(); - } - // The logic in _find() prohibits finding and returning a position if the 'used' bit - // in the header is set and dups are disallowed. - invariant(dupsAllowed); - - // The key and value are already in the index. Not an error because documents that have - // already been indexed may be seen again due to updates during a background index scan. - return Status::OK(); - } - - DiskLoc childLoc = childLocForPos(bucket, pos); - - // In current usage, rightChild is NULL for a new key and is not NULL when we are - // promoting a split key. These are the only two cases where _insert() is called - // currently. 
- if (childLoc.isNull() || !rightChild.isNull()) { - insertHere(opCtx, bucketLoc, pos, key, recordLoc, leftChild, rightChild); - return Status::OK(); - } else { - return _insert(opCtx, - getBucket(opCtx, childLoc), - childLoc, - key, - recordLoc, - dupsAllowed, - DiskLoc(), - DiskLoc()); - } -} - -template <class BtreeLayout> -DiskLoc BtreeLogic<BtreeLayout>::advance(OperationContext* opCtx, - const DiskLoc& bucketLoc, - int* posInOut, - int direction) const { - BucketType* bucket = getBucket(opCtx, bucketLoc); - - if (*posInOut < 0 || *posInOut >= bucket->n) { - log() << "ASSERT failure advancing btree bucket" << endl; - log() << " thisLoc: " << bucketLoc.toString() << endl; - log() << " keyOfs: " << *posInOut << " n:" << bucket->n << " direction: " << direction - << endl; - // log() << bucketSummary() << endl; - MONGO_UNREACHABLE; - } - - // XXX document - int adj = direction < 0 ? 1 : 0; - int ko = *posInOut + direction; - - // Look down if we need to. - DiskLoc nextDownLoc = childLocForPos(bucket, ko + adj); - BucketType* nextDown = getBucket(opCtx, nextDownLoc); - if (NULL != nextDown) { - for (;;) { - if (direction > 0) { - *posInOut = 0; - } else { - *posInOut = nextDown->n - 1; - } - DiskLoc newNextDownLoc = childLocForPos(nextDown, *posInOut + adj); - BucketType* newNextDownBucket = getBucket(opCtx, newNextDownLoc); - if (NULL == newNextDownBucket) { - break; - } - nextDownLoc = newNextDownLoc; - nextDown = newNextDownBucket; - } - return nextDownLoc; - } - - // Looking down isn't the right choice, move forward. - if (ko < bucket->n && ko >= 0) { - *posInOut = ko; - return bucketLoc; - } - - // Hit the end of the bucket, move up and over. - DiskLoc childLoc = bucketLoc; - DiskLoc ancestor = getBucket(opCtx, bucketLoc)->parent; - for (;;) { - if (ancestor.isNull()) { - break; - } - BucketType* an = getBucket(opCtx, ancestor); - for (int i = 0; i < an->n; i++) { - if (childLocForPos(an, i + adj) == childLoc) { - *posInOut = i; - return ancestor; - } - } - invariant(direction < 0 || an->nextChild == childLoc); - // parent exhausted also, keep going up - childLoc = ancestor; - ancestor = an->parent; - } - - return DiskLoc(); -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::keyIsUsed(OperationContext* opCtx, - const DiskLoc& loc, - const int& pos) const { - return getKeyHeader(getBucket(opCtx, loc), pos).isUsed(); -} - -template <class BtreeLayout> -bool BtreeLogic<BtreeLayout>::locate(OperationContext* opCtx, - const BSONObj& key, - const DiskLoc& recordLoc, - const int direction, - int* posOut, - DiskLoc* bucketLocOut) const { - // Clear out any data. - *posOut = 0; - *bucketLocOut = DiskLoc(); - - bool found = false; - KeyDataOwnedType owned(key); - - *bucketLocOut = _locate(opCtx, getRootLoc(opCtx), owned, posOut, &found, recordLoc, direction); - - skipUnusedKeys(opCtx, bucketLocOut, posOut, direction); - - return found; -} - -/** - * Recursively walk down the btree, looking for a match of key and recordLoc. - * Caller should have acquired lock on bucketLoc. - */ -template <class BtreeLayout> -DiskLoc BtreeLogic<BtreeLayout>::_locate(OperationContext* opCtx, - const DiskLoc& bucketLoc, - const KeyDataType& key, - int* posOut, - bool* foundOut, - const DiskLoc& recordLoc, - const int direction) const { - int position; - BucketType* bucket = getBucket(opCtx, bucketLoc); - // XXX: owned to not owned conversion(?) - _find(opCtx, bucket, key, recordLoc, false, &position, foundOut).transitional_ignore(); - - // Look in our current bucket. 
- if (*foundOut) {
- *posOut = position;
- return bucketLoc;
- }
-
- // Not in our current bucket. 'position' tells us where there may be a child.
- DiskLoc childLoc = childLocForPos(bucket, position);
-
- if (!childLoc.isNull()) {
- DiskLoc inChild = _locate(opCtx, childLoc, key, posOut, foundOut, recordLoc, direction);
- if (!inChild.isNull()) {
- return inChild;
- }
- }
-
- *posOut = position;
-
- if (direction < 0) {
- // The key *would* go to our left.
- (*posOut)--;
- if (-1 == *posOut) {
- // But there's no space for that in our bucket.
- return DiskLoc();
- } else {
- return bucketLoc;
- }
- } else {
- // The key would go to our right...
- if (bucket->n == *posOut) {
- return DiskLoc();
- } else {
- // But only if there is space.
- return bucketLoc;
- }
- }
-}
-
-// TODO relocate
-template <class BtreeLayout>
-bool BtreeLogic<BtreeLayout>::isHead(BucketType* bucket) {
- return bucket->parent.isNull();
-}
-
-template <class BtreeLayout>
-typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::getBucket(
- OperationContext* opCtx, const RecordId id) const {
- if (id.isNull()) {
- return NULL;
- }
-
- RecordData recordData = _recordStore->dataFor(opCtx, id);
-
- // we need to be working on the raw bytes, not a transient copy
- invariant(!recordData.isOwned());
-
- return reinterpret_cast<BucketType*>(const_cast<char*>(recordData.data()));
-}
-
-template <class BtreeLayout>
-typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::getRoot(
- OperationContext* opCtx) const {
- return getBucket(opCtx, _headManager->getHead(opCtx));
-}
-
-template <class BtreeLayout>
-DiskLoc BtreeLogic<BtreeLayout>::getRootLoc(OperationContext* opCtx) const {
- return DiskLoc::fromRecordId(_headManager->getHead(opCtx));
-}
-
-template <class BtreeLayout>
-typename BtreeLogic<BtreeLayout>::BucketType* BtreeLogic<BtreeLayout>::childForPos(
- OperationContext* opCtx, BucketType* bucket, int pos) const {
- DiskLoc loc = childLocForPos(bucket, pos);
- return getBucket(opCtx, loc);
-}
-
-template <class BtreeLayout>
-typename BtreeLogic<BtreeLayout>::LocType& BtreeLogic<BtreeLayout>::childLocForPos(
- BucketType* bucket, int pos) {
- if (bucket->n == pos) {
- return bucket->nextChild;
- } else {
- return getKeyHeader(bucket, pos).prevChildBucket;
- }
-}
-
-//
-// And, template stuff.
-//
-
-// V0 format.
-template struct FixedWidthKey<DiskLoc>;
-template class BtreeLogic<BtreeLayoutV0>;
-
-// V1 format.
-template struct FixedWidthKey<DiskLoc56Bit>;
-template class BtreeLogic<BtreeLayoutV1>;
-
-} // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h b/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
deleted file mode 100644
index 1f6f0645875..00000000000
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic.h
+++ /dev/null
@@ -1,587 +0,0 @@
-/**
- * Copyright (C) 2014 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <string> - -#include "mongo/db/catalog/head_manager.h" -#include "mongo/db/catalog/index_catalog_entry.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/index_entry_comparison.h" -#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h" -#include "mongo/db/storage/mmap_v1/btree/key.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -class PseudoRandom; -class RecordStore; -class SavedCursorRegistry; - -// Used for unit-testing only -template <class BtreeLayout> -class BtreeLogicTestBase; -template <class BtreeLayout> -class ArtificialTreeBuilder; - -/** - * This is the logic for manipulating the Btree. It is (mostly) independent of the on-disk - * format. - */ -template <class BtreeLayout> -class BtreeLogic { -public: - // AKA _keyNode - typedef typename BtreeLayout::FixedWidthKeyType KeyHeaderType; - - // AKA Key - typedef typename BtreeLayout::KeyType KeyDataType; - - // AKA KeyOwned - typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; - - // AKA Loc - typedef typename BtreeLayout::LocType LocType; - - // AKA BucketBasics or BtreeBucket, either one. - typedef typename BtreeLayout::BucketType BucketType; - - /** - * 'head' manages the catalog information. - * 'store' allocates and frees buckets. - * 'ordering' is meta-information we store in the catalog. - * 'indexName' is a string identifying the index that we use to print errors with. - */ - BtreeLogic(HeadManager* head, - RecordStore* store, - SavedCursorRegistry* cursors, - const Ordering& ordering, - const std::string& indexName, - bool isUnique) - : _headManager(head), - _recordStore(store), - _cursorRegistry(cursors), - _ordering(ordering), - _indexName(indexName), - _isUnique(isUnique) {} - - // - // Public-facing - // - - class Builder { - public: - typedef typename BtreeLayout::KeyOwnedType KeyDataOwnedType; - typedef typename BtreeLayout::KeyType KeyDataType; - - Status addKey(const BSONObj& key, const DiskLoc& loc); - - private: - friend class BtreeLogic; - - class SetRightLeafLocChange; - - Builder(BtreeLogic* logic, OperationContext* opCtx, bool dupsAllowed); - - /** - * Creates and returns a new empty bucket to the right of leftSib, maintaining the - * internal consistency of the tree. leftSib must be the right-most child of its parent - * or it must be the root. - */ - DiskLoc newBucket(BucketType* leftSib, DiskLoc leftSibLoc); - - BucketType* _getModifiableBucket(DiskLoc loc); - BucketType* _getBucket(DiskLoc loc); - - // Not owned. - BtreeLogic* _logic; - - DiskLoc _rightLeafLoc; // DiskLoc of right-most (highest) leaf bucket. - bool _dupsAllowed; - std::unique_ptr<KeyDataOwnedType> _keyLast; - - // Not owned. 
- OperationContext* _opCtx; - }; - - /** - * Caller owns the returned pointer. - * 'this' must outlive the returned pointer. - */ - Builder* newBuilder(OperationContext* opCtx, bool dupsAllowed); - - Status dupKeyCheck(OperationContext* opCtx, const BSONObj& key, const DiskLoc& loc) const; - - Status insert(OperationContext* opCtx, - const BSONObj& rawKey, - const DiskLoc& value, - bool dupsAllowed); - - /** - * Navigates down the tree and locates the bucket and position containing a record with - * the specified <key, recordLoc> combination. - * - * @return true if the exact <key, recordLoc> was found. Otherwise, false and the - * bucketLocOut would contain the bucket containing key which is before or after the - * searched one (dependent on the direction). - */ - bool locate(OperationContext* opCtx, - const BSONObj& key, - const DiskLoc& recordLoc, - const int direction, - int* posOut, - DiskLoc* bucketLocOut) const; - - void advance(OperationContext* opCtx, - DiskLoc* bucketLocInOut, - int* posInOut, - int direction) const; - - bool exists(OperationContext* opCtx, const KeyDataType& key) const; - - bool unindex(OperationContext* opCtx, const BSONObj& key, const DiskLoc& recordLoc); - - bool isEmpty(OperationContext* opCtx) const; - - long long fullValidate(OperationContext*, - long long* unusedCount, - bool strict, - bool dumpBuckets, - unsigned depth) const; - - DiskLoc getDiskLoc(OperationContext* opCtx, - const DiskLoc& bucketLoc, - const int keyOffset) const; - - BSONObj getKey(OperationContext* opCtx, const DiskLoc& bucketLoc, const int keyOffset) const; - - /** - * Returns a pseudo-random element from the tree. It is an error to call this method if the tree - * is empty. - */ - IndexKeyEntry getRandomEntry(OperationContext* opCtx) const; - - DiskLoc getHead(OperationContext* opCtx) const { - return DiskLoc::fromRecordId(_headManager->getHead(opCtx)); - } - - Status touch(OperationContext* opCtx) const; - - // - // Composite key navigation methods - // - - void customLocate(OperationContext* opCtx, - DiskLoc* locInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const; - - void advanceTo(OperationContext*, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const; - - void restorePosition(OperationContext* opCtx, - const BSONObj& savedKey, - const DiskLoc& savedLoc, - int direction, - DiskLoc* bucketInOut, - int* keyOffsetInOut) const; - - // - // Creation and deletion - // - - /** - * Returns OK if the index was uninitialized before, error status otherwise. - */ - Status initAsEmpty(OperationContext* opCtx); - - // - // Size constants - // - - const RecordStore* getRecordStore() const { - return _recordStore; - } - - SavedCursorRegistry* savedCursors() const { - return _cursorRegistry; - } - - static int lowWaterMark(); - - Ordering ordering() const { - return _ordering; - } - - int customBSONCmp(const BSONObj& inIndex_left, - const IndexSeekPoint& seekPoint_right, - int direction) const; - - bool isUnique() const { - return _isUnique; - } - -private: - friend class BtreeLogic::Builder; - - // Used for unit-testing only - friend class BtreeLogicTestBase<BtreeLayout>; - friend class ArtificialTreeBuilder<BtreeLayout>; - - /** - * This is an in memory wrapper for the variable length data associated with a - * KeyHeaderType. It points to on-disk data but is not itself on-disk data. 
- *
- * This object and its BSONObj 'key' will become invalid if the KeyHeaderType data that owns
- * this is moved within the btree. In general, a KeyWrapper should not be expected to be
- * valid after a write.
- */
- struct FullKey {
- FullKey(const BucketType* bucket, int i)
- : header(getKeyHeader(bucket, i)),
- prevChildBucket(header.prevChildBucket),
- recordLoc(header.recordLoc),
- data(bucket->data + header.keyDataOfs()) {}
-
- // This is actually a reference to something on-disk.
- const KeyHeaderType& header;
-
- // These are actually in 'header'.
- const LocType& prevChildBucket;
- const LocType& recordLoc;
-
- // This is *not* memory-mapped but its members point to something on-disk.
- KeyDataType data;
- };
-
- //
- // Functions that depend on the templated type info but nothing in 'this'.
- //
-
- static LocType& childLocForPos(BucketType* bucket, int pos);
-
- static FullKey getFullKey(const BucketType* bucket, int i);
-
- static KeyHeaderType& getKeyHeader(BucketType* bucket, int i);
-
- static const KeyHeaderType& getKeyHeader(const BucketType* bucket, int i);
-
- static char* dataAt(BucketType* bucket, short ofs);
-
- static void markUnused(BucketType* bucket, int keypos);
-
- static int totalDataSize(BucketType* bucket);
-
- static void init(BucketType* bucket);
-
- static int _alloc(BucketType* bucket, int bytes);
-
- static void _unalloc(BucketType* bucket, int bytes);
-
- static void _delKeyAtPos(BucketType* bucket, int keypos, bool mayEmpty = false);
-
- static void popBack(BucketType* bucket, DiskLoc* recordLocOut, KeyDataType* keyDataOut);
-
- static bool mayDropKey(BucketType* bucket, int index, int refPos);
-
- static int _packedDataSize(BucketType* bucket, int refPos);
-
- static void setPacked(BucketType* bucket);
-
- static void setNotPacked(BucketType* bucket);
-
- static BucketType* btreemod(OperationContext* opCtx, BucketType* bucket);
-
- static int splitPos(BucketType* bucket, int keypos);
-
- static void reserveKeysFront(BucketType* bucket, int nAdd);
-
- static void setKey(BucketType* bucket,
- int i,
- const DiskLoc recordLoc,
- const KeyDataType& key,
- const DiskLoc prevChildBucket);
-
- static bool isHead(BucketType* bucket);
-
- static void dumpBucket(const BucketType* bucket, int indentLength = 0);
-
- static void assertValid(const std::string& ns,
- BucketType* bucket,
- const Ordering& ordering,
- bool force = false);
-
- //
- // 'this'-specific helpers (require record store, catalog information, or ordering, or type
- // information).
- // - - bool basicInsert(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int& keypos, - const KeyDataType& key, - const DiskLoc recordLoc); - - void dropFront(BucketType* bucket, int nDrop, int& refpos); - - void _pack(OperationContext* opCtx, BucketType* bucket, const DiskLoc thisLoc, int& refPos); - - void customLocate(OperationContext* opCtx, - DiskLoc* locInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction, - std::pair<DiskLoc, int>& bestParent) const; - - Status _find(OperationContext* opCtx, - BucketType* bucket, - const KeyDataType& key, - const DiskLoc& recordLoc, - bool errorIfDup, - int* keyPositionOut, - bool* foundOut) const; - - bool customFind(OperationContext* opCtx, - int low, - int high, - const IndexSeekPoint& seekPoint, - int direction, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - std::pair<DiskLoc, int>& bestParent) const; - - void advanceToImpl(OperationContext* opCtx, - DiskLoc* thisLocInOut, - int* keyOfsInOut, - const IndexSeekPoint& seekPoint, - int direction) const; - - bool wouldCreateDup(OperationContext* opCtx, const KeyDataType& key, const DiskLoc self) const; - - bool keyIsUsed(OperationContext* opCtx, const DiskLoc& loc, const int& pos) const; - - void skipUnusedKeys(OperationContext* opCtx, DiskLoc* loc, int* pos, int direction) const; - - DiskLoc advance(OperationContext* opCtx, - const DiskLoc& bucketLoc, - int* posInOut, - int direction) const; - - DiskLoc _locate(OperationContext* opCtx, - const DiskLoc& bucketLoc, - const KeyDataType& key, - int* posOut, - bool* foundOut, - const DiskLoc& recordLoc, - const int direction) const; - - long long _fullValidate(OperationContext* opCtx, - const DiskLoc bucketLoc, - long long* unusedCount, - bool strict, - bool dumpBuckets, - unsigned depth) const; - - DiskLoc _addBucket(OperationContext* opCtx); - - bool canMergeChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - const int leftIndex); - - // has to look in children of 'bucket' and requires record store - int _rebalancedSeparatorPos(OperationContext* opCtx, BucketType* bucket, int leftIndex); - - void _packReadyForMod(BucketType* bucket, int& refPos); - - void truncateTo(BucketType* bucket, int N, int& refPos); - - void split(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int keypos, - const DiskLoc recordLoc, - const KeyDataType& key, - const DiskLoc lchild, - const DiskLoc rchild); - - Status _insert(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - const KeyDataType& key, - const DiskLoc recordLoc, - bool dupsAllowed, - const DiskLoc leftChild, - const DiskLoc rightChild); - - // TODO take a BucketType*? 
- void insertHere(OperationContext* opCtx, - const DiskLoc bucketLoc, - int pos, - const KeyDataType& key, - const DiskLoc recordLoc, - const DiskLoc leftChild, - const DiskLoc rightChild); - - std::string dupKeyError(const KeyDataType& key) const; - - void setInternalKey(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int keypos, - const DiskLoc recordLoc, - const KeyDataType& key, - const DiskLoc lchild, - const DiskLoc rchild); - - void fixParentPtrs(OperationContext* trans, - BucketType* bucket, - const DiskLoc bucketLoc, - int firstIndex = 0, - int lastIndex = -1); - - bool mayBalanceWithNeighbors(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc); - - void doBalanceChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex); - - void doBalanceLeftToRight(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc thisLoc, - int leftIndex, - int split, - BucketType* l, - const DiskLoc lchild, - BucketType* r, - const DiskLoc rchild); - - void doBalanceRightToLeft(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc thisLoc, - int leftIndex, - int split, - BucketType* l, - const DiskLoc lchild, - BucketType* r, - const DiskLoc rchild); - - bool tryBalanceChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex); - - int indexInParent(OperationContext* opCtx, BucketType* bucket, const DiskLoc bucketLoc) const; - - void doMergeChildren(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int leftIndex); - - void replaceWithNextChild(OperationContext* opCtx, BucketType* bucket, const DiskLoc bucketLoc); - - void deleteInternalKey(OperationContext* opCtx, - BucketType* bucket, - const DiskLoc bucketLoc, - int keypos); - - void delKeyAtPos(OperationContext* opCtx, BucketType* bucket, const DiskLoc bucketLoc, int p); - - void delBucket(OperationContext* opCtx, BucketType* bucket, const DiskLoc bucketLoc); - - void deallocBucket(OperationContext* opCtx, BucketType* bucket, const DiskLoc bucketLoc); - - bool _keyIsAt(const BSONObj& savedKey, - const DiskLoc& savedLoc, - BucketType* bucket, - int keyPos) const; - - /** - * Tries to push key into bucket. Return false if it can't because key doesn't fit. - * - * bucket must be declared as writable by the caller. - * The new key/recordLoc pair must be higher than any others in bucket. - * - * TODO needs 'this' for _ordering for sanity check - */ - bool pushBack(BucketType* bucket, - const DiskLoc recordLoc, - const KeyDataType& key, - const DiskLoc prevChild); - - - BucketType* childForPos(OperationContext* opCtx, BucketType* bucket, int pos) const; - - BucketType* getBucket(OperationContext* opCtx, const DiskLoc dl) const { - return getBucket(opCtx, dl.toRecordId()); - } - BucketType* getBucket(OperationContext* opCtx, const RecordId dl) const; - - BucketType* getRoot(OperationContext* opCtx) const; - - DiskLoc getRootLoc(OperationContext* opCtx) const; - - void recordRandomWalk(OperationContext* opCtx, - PseudoRandom* prng, - BucketType* curBucket, - int64_t nBucketsInCurrentLevel, - std::vector<int64_t>* nKeysInLevel, - std::vector<FullKey>* selectedKeys) const; - - // - // Data - // - - // Not owned here. - HeadManager* _headManager; - - // Not owned here. - RecordStore* _recordStore; - - // Not owned Here. - SavedCursorRegistry* _cursorRegistry; - - Ordering _ordering; - - std::string _indexName; - - // True if this is a unique index, i.e. 
diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
deleted file mode 100644
index b3667b14e40..00000000000
--- a/src/mongo/db/storage/mmap_v1/btree/btree_logic_test.cpp
+++ /dev/null
@@ -1,2500 +0,0 @@
-// btree_logic_test.cpp : Btree unit tests
-//
-
-/**
- * Copyright (C) 2014 MongoDB
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects
- * for all of the code used other than as permitted herein. If you modify
- * file(s) with this exception, you may extend this exception to your
- * version of the file(s), but you are not obligated to do so. If you do not
- * wish to do so, delete this exception statement from your version. If you
- * delete this exception statement from all source files in the program,
- * then also delete it in the license file.
- */
-
-// This file contains simple single-threaded tests, which check various aspects of the Btree logic
-//
-
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage
-
-#include "mongo/db/operation_context_noop.h"
-#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h"
-#include "mongo/unittest/unittest.h"
-#include "mongo/util/log.h"
-
-
-namespace mongo {
-
-using std::string;
-
-/**
- * This class is made friend of BtreeLogic so we can add whatever private method accesses we
- * need to it, to be used by the tests.
- */
-template <class BtreeLayoutType>
-class BtreeLogicTestBase {
-public:
-    typedef typename BtreeLayoutType::BucketType BucketType;
-    typedef typename BtreeLayoutType::FixedWidthKeyType FixedWidthKeyType;
-
-    typedef typename BtreeLogic<BtreeLayoutType>::FullKey FullKey;
-    typedef typename BtreeLogic<BtreeLayoutType>::KeyDataOwnedType KeyDataOwnedType;
-
-    BtreeLogicTestBase() : _helper(BSON("TheKey" << 1)) {}
-
-    virtual ~BtreeLogicTestBase() {}
-
-protected:
-    void checkValidNumKeys(int nKeys) {
-        OperationContextNoop opCtx;
-        ASSERT_EQUALS(nKeys, _helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-    }
-
-    Status insert(const BSONObj& key, const DiskLoc dl, bool dupsAllowed = true) {
-        OperationContextNoop opCtx;
-        return _helper.btree.insert(&opCtx, key, dl, dupsAllowed);
-    }
-
-    bool unindex(const BSONObj& key) {
-        OperationContextNoop opCtx;
-        return _helper.btree.unindex(&opCtx, key, _helper.dummyDiskLoc);
-    }
-
-    void locate(const BSONObj& key,
-                int expectedPos,
-                bool expectedFound,
-                const RecordId& expectedLocation,
-                int direction) {
-        return locate(
-            key, expectedPos, expectedFound, DiskLoc::fromRecordId(expectedLocation), direction);
-    }
-    void locate(const BSONObj& key,
-                int expectedPos,
-                bool expectedFound,
-                const DiskLoc& expectedLocation,
-                int direction) {
-        int pos;
-        DiskLoc loc;
-        OperationContextNoop opCtx;
-        ASSERT_EQUALS(
-            expectedFound,
-            _helper.btree.locate(&opCtx, key, _helper.dummyDiskLoc, direction, &pos, &loc));
-        ASSERT_EQUALS(expectedLocation, loc);
-        ASSERT_EQUALS(expectedPos, pos);
-    }
-
-    const BucketType* child(const BucketType* bucket, int i) const {
-        verify(i <= bucket->n);
-
-        DiskLoc diskLoc;
-        if (i == bucket->n) {
-            diskLoc = bucket->nextChild;
-        } else {
-            FullKey fullKey = BtreeLogic<BtreeLayoutType>::getFullKey(bucket, i);
-            diskLoc = fullKey.prevChildBucket;
-        }
-
-        verify(!diskLoc.isNull());
-
-        return _helper.btree.getBucket(NULL, diskLoc);
-    }
-
-    BucketType* head() const {
-        OperationContextNoop opCtx;
-        return _helper.btree.getBucket(&opCtx, _helper.headManager.getHead(&opCtx));
-    }
-
-    void forcePackBucket(const RecordId bucketLoc) {
-        BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
-
-        bucket->topSize += bucket->emptySize;
-        bucket->emptySize = 0;
-        BtreeLogic<BtreeLayoutType>::setNotPacked(bucket);
-    }
-
-    void truncateBucket(BucketType* bucket, int N, int& refPos) {
-        _helper.btree.truncateTo(bucket, N, refPos);
-    }
-
-    int bucketPackedDataSize(BucketType* bucket, int refPos) {
-        return _helper.btree._packedDataSize(bucket, refPos);
-    }
-
-    int bucketRebalancedSeparatorPos(const RecordId bucketLoc, int leftIndex) {
-        BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
-        OperationContextNoop opCtx;
-        return _helper.btree._rebalancedSeparatorPos(&opCtx, bucket, leftIndex);
-    }
-
-    FullKey getKey(const RecordId bucketLoc, int pos) const {
-        const BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
-        return BtreeLogic<BtreeLayoutType>::getFullKey(bucket, pos);
-    }
-
-    void markKeyUnused(const DiskLoc bucketLoc, int keyPos) {
-        BucketType* bucket = _helper.btree.getBucket(NULL, bucketLoc);
-        invariant(keyPos >= 0 && keyPos < bucket->n);
-
-        _helper.btree.getKeyHeader(bucket, keyPos).setUnused();
-    }
-
-    DiskLoc newBucket() {
-        OperationContextNoop opCtx;
-        return _helper.btree._addBucket(&opCtx);
-    }
-
-    /**
-     * Sets the nextChild pointer for the bucket at the specified location.
-     */
-    void setBucketNextChild(const DiskLoc bucketLoc, const DiskLoc nextChild) {
-        OperationContextNoop opCtx;
-
-        BucketType* bucket = _helper.btree.getBucket(&opCtx, bucketLoc);
-        bucket->nextChild = nextChild;
-
-        _helper.btree.fixParentPtrs(&opCtx, bucket, bucketLoc);
-    }
-
-protected:
-    BtreeLogicTestHelper<BtreeLayoutType> _helper;
-};
-
-//
-// TESTS
-//
-
-template <class OnDiskFormat>
-class SimpleCreate : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        this->checkValidNumKeys(0);
-    }
-};
-
-template <class OnDiskFormat>
-class SimpleInsertDelete : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        BSONObj key = simpleKey('z');
-        this->insert(key, this->_helper.dummyDiskLoc).transitional_ignore();
-
-        this->checkValidNumKeys(1);
-        this->locate(key, 0, true, this->_helper.headManager.getHead(&opCtx), 1);
-
-        this->unindex(key);
-
-        this->checkValidNumKeys(0);
-        this->locate(key, 0, false, DiskLoc(), 1);
-    }
-};
-
-template <class OnDiskFormat>
-class SplitUnevenBucketBase : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        for (int i = 0; i < 10; ++i) {
-            BSONObj shortKey = simpleKey(shortToken(i), 1);
-            this->insert(shortKey, this->_helper.dummyDiskLoc).transitional_ignore();
-
-            BSONObj longKey = simpleKey(longToken(i), 800);
-            this->insert(longKey, this->_helper.dummyDiskLoc).transitional_ignore();
-        }
-
-        this->checkValidNumKeys(20);
-        ASSERT_EQUALS(1, this->head()->n);
-        checkSplit();
-    }
-
-protected:
-    virtual char shortToken(int i) const = 0;
-    virtual char longToken(int i) const = 0;
-    virtual void checkSplit() = 0;
-
-    static char leftToken(int i) {
-        return 'a' + i;
-    }
-
-    static char rightToken(int i) {
-        return 'z' - i;
-    }
-};
-
-template <class OnDiskFormat>
-class SplitRightHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
-private:
-    virtual char shortToken(int i) const {
-        return this->leftToken(i);
-    }
-    virtual char longToken(int i) const {
-        return this->rightToken(i);
-    }
-    virtual void checkSplit() {
-        ASSERT_EQUALS(15, this->child(this->head(), 0)->n);
-        ASSERT_EQUALS(4, this->child(this->head(), 1)->n);
-    }
-};
-
-template <class OnDiskFormat>
-class SplitLeftHeavyBucket : public SplitUnevenBucketBase<OnDiskFormat> {
-private:
-    virtual char shortToken(int i) const {
-        return this->rightToken(i);
-    }
-    virtual char longToken(int i) const {
-        return this->leftToken(i);
-    }
-    virtual void checkSplit() {
-        ASSERT_EQUALS(4, this->child(this->head(), 0)->n);
-        ASSERT_EQUALS(15, this->child(this->head(), 1)->n);
-    }
-};
-
-template <class OnDiskFormat>
-class MissingLocate : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        for (int i = 0; i < 3; ++i) {
-            BSONObj k = simpleKey('b' + 2 * i);
-            this->insert(k, this->_helper.dummyDiskLoc).transitional_ignore();
-        }
-
-        locateExtended(1, 'a', 'b', this->_helper.headManager.getHead(&opCtx));
-        locateExtended(1, 'c', 'd', this->_helper.headManager.getHead(&opCtx));
-        locateExtended(1, 'e', 'f', this->_helper.headManager.getHead(&opCtx));
-        locateExtended(1, 'g', 'g' + 1, RecordId());  // of course, 'h' isn't in the index.
-
-        // old behavior
-        // locateExtended( -1, 'a', 'b', dl() );
-        // locateExtended( -1, 'c', 'd', dl() );
-        // locateExtended( -1, 'e', 'f', dl() );
-        // locateExtended( -1, 'g', 'f', dl() );
-
-        locateExtended(-1, 'a', 'a' - 1, RecordId());  // of course, 'a' - 1 isn't in the index
-        locateExtended(-1, 'c', 'b', this->_helper.headManager.getHead(&opCtx));
-        locateExtended(-1, 'e', 'd', this->_helper.headManager.getHead(&opCtx));
-        locateExtended(-1, 'g', 'f', this->_helper.headManager.getHead(&opCtx));
-    }
-
-private:
-    void locateExtended(int direction, char token, char expectedMatch, RecordId expectedLocation) {
-        const BSONObj k = simpleKey(token);
-        int expectedPos = (expectedMatch - 'b') / 2;
-
-        this->locate(k, expectedPos, false, expectedLocation, direction);
-    }
-};
-
-template <class OnDiskFormat>
-class MissingLocateMultiBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-
-        // This causes split
-        this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-
-        int pos;
-        DiskLoc loc;
-
-        // 'E' is the split point and should be in the head the rest should be ~50/50
-        const BSONObj splitPoint = simpleKey('E', 800);
-        this->_helper.btree.locate(&opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
-        ASSERT_EQUALS(this->_helper.headManager.getHead(&opCtx), loc.toRecordId());
-        ASSERT_EQUALS(0, pos);
-
-        // Find the one before 'E'
-        int largePos;
-        DiskLoc largeLoc;
-        this->_helper.btree.locate(
-            &opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
-        this->_helper.btree.advance(&opCtx, &largeLoc, &largePos, -1);
-
-        // Find the one after 'E'
-        int smallPos;
-        DiskLoc smallLoc;
-        this->_helper.btree.locate(
-            &opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
-        this->_helper.btree.advance(&opCtx, &smallLoc, &smallPos, 1);
-
-        ASSERT_NOT_EQUALS(smallLoc, largeLoc);
-        ASSERT_NOT_EQUALS(smallLoc, loc);
-        ASSERT_NOT_EQUALS(largeLoc, loc);
-    }
-};
-
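These two tests pin down the split-point heuristic from both sides, as a worked example: in MissingLocateMultiBucket above, the tenth ~800-byte key ('I') arrives out of order, so the bucket splits near the median and 'E' becomes the separator in the head (roughly 50/50); in SERVER983 below, the keys arrive strictly ascending, so the optimized right-heavy split leaves about 90% of the data in the left bucket and 'H' becomes the separator.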
-/**
- * Validates that adding keys incrementally produces buckets, which are 90%/10% full.
- */
-template <class OnDiskFormat>
-class SERVER983 : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        this->insert(simpleKey('A', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('B', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('C', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('D', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('E', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('F', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('G', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('H', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-        this->insert(simpleKey('I', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-
-        // This will cause split
-        this->insert(simpleKey('J', 800), this->_helper.dummyDiskLoc).transitional_ignore();
-
-        int pos;
-        DiskLoc loc;
-
-        // 'H' is the maximum 'large' interval key, 90% should be < 'H' and 10% larger
-        const BSONObj splitPoint = simpleKey('H', 800);
-        this->_helper.btree.locate(&opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &pos, &loc);
-        ASSERT_EQUALS(this->_helper.headManager.getHead(&opCtx), loc.toRecordId());
-        ASSERT_EQUALS(0, pos);
-
-        // Find the one before 'H'
-        int largePos;
-        DiskLoc largeLoc;
-        this->_helper.btree.locate(
-            &opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &largePos, &largeLoc);
-        this->_helper.btree.advance(&opCtx, &largeLoc, &largePos, -1);
-
-        // Find the one after 'H'
-        int smallPos;
-        DiskLoc smallLoc;
-        this->_helper.btree.locate(
-            &opCtx, splitPoint, this->_helper.dummyDiskLoc, 1, &smallPos, &smallLoc);
-        this->_helper.btree.advance(&opCtx, &smallLoc, &smallPos, 1);
-
-        ASSERT_NOT_EQUALS(smallLoc, largeLoc);
-        ASSERT_NOT_EQUALS(smallLoc, loc);
-        ASSERT_NOT_EQUALS(largeLoc, loc);
-    }
-};
-
-template <class OnDiskFormat>
-class DontReuseUnused : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        for (int i = 0; i < 10; ++i) {
-            const BSONObj k = simpleKey('b' + 2 * i, 800);
-            this->insert(k, this->_helper.dummyDiskLoc).transitional_ignore();
-        }
-
-        const BSONObj root = simpleKey('p', 800);
-        this->unindex(root);
-
-        this->insert(root, this->_helper.dummyDiskLoc).transitional_ignore();
-        this->locate(root, 0, true, this->head()->nextChild, 1);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeBucketsTestBase : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        for (int i = 0; i < 10; ++i) {
-            const BSONObj k = simpleKey('b' + 2 * i, 800);
-            this->insert(k, this->_helper.dummyDiskLoc).transitional_ignore();
-        }
-
-        // numRecords() - 1, because this->_helper.dummyDiskLoc is actually in the record store too
-        ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
-
-        long long expectedCount = 10 - unindexKeys();
-        ASSERT_EQUALS(1, this->_helper.recordStore.numRecords(NULL) - 1);
-
-        long long unusedCount = 0;
-        ASSERT_EQUALS(expectedCount,
-                      this->_helper.btree.fullValidate(&opCtx, &unusedCount, true, false, 0));
-        ASSERT_EQUALS(0, unusedCount);
-    }
-
-protected:
-    virtual int unindexKeys() = 0;
-};
-
-template <class OnDiskFormat>
-class MergeBucketsLeft : public MergeBucketsTestBase<OnDiskFormat> {
-    virtual int unindexKeys() {
-        BSONObj k = simpleKey('b', 800);
-        this->unindex(k);
-
-        k = simpleKey('b' + 2, 800);
-        this->unindex(k);
-
-        k = simpleKey('b' + 4, 800);
-        this->unindex(k);
-
-        k = simpleKey('b' + 6, 800);
-        this->unindex(k);
-
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeBucketsRight : public MergeBucketsTestBase<OnDiskFormat> {
-    virtual int unindexKeys() {
-        const BSONObj k = simpleKey('b' + 2 * 9, 800);
-        this->unindex(k);
-        return 1;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeBucketsDontReplaceHead : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        for (int i = 0; i < 18; ++i) {
-            const BSONObj k = simpleKey('a' + i, 800);
-            this->insert(k, this->_helper.dummyDiskLoc).transitional_ignore();
-        }
-
-        // numRecords(NULL) - 1, because fixedDiskLoc is actually in the record store too
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL) - 1);
-
-        const BSONObj k = simpleKey('a' + 17, 800);
-        this->unindex(k);
-        ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL) - 1);
-
-        long long unusedCount = 0;
-        ASSERT_EQUALS(17, this->_helper.btree.fullValidate(&opCtx, &unusedCount, true, false, 0));
-        ASSERT_EQUALS(0, unusedCount);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeBucketsDelInternal : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}");
-        ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "bb");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{b:{a:null},d:{c:null},f:{e:null},_:{g:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class MergeBucketsRightNull : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},_:{f:{e:null},h:{g:null}}}");
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "bb");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}");
-    }
-};
-
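A reading aid for the makeTree()/checkStructure() specs used from here on (their parser lived in the deleted btree_test_help.h, so this decoding is inferred from the tests themselves): each {...} is one bucket, a key whose value is an object gets that object as the child hanging to its left (its prevChildBucket), and `_` names the bucket's rightmost nextChild. Keys of the form $NN or $NN$SSS appear to be bigNumString() keys with hex value 0xNN and, optionally, key size 0xSSS. For example, the first spec above decodes as:

    {d:{b:{a:null},bb:null,_:{c:null}},_:{f:{e:null},_:{g:null}}}
      root bucket: the single key d, plus a rightmost child
      left of d:   bucket {b, bb}, with {a} left of b and {c} as its nextChild
      rightmost:   bucket {f}, with {e} left of f and {g} as its nextChild

which is 8 keys in 7 buckets, matching the fullValidate() and numRecords() assertions in MergeBucketsDelInternal.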
-// This comment was here during porting, not sure what it means:
-//
-// "Not yet handling this case"
-template <class OnDiskFormat>
-class DontMergeSingleBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{d:{b:{a:null},c:null}}");
-
-        ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "c");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{d:{b:{a:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class ParentMergeNonRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{d:{b:{a:null},bb:null,cc:{c:null}},i:{f:{e:null},h:{g:null}}}");
-
-        ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "bb");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // Child does not currently replace parent in this case. Also, the tree
-        // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class ParentMergeNonRightToRight : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{d:{b:{a:null},cc:{c:null}},i:{f:{e:null},ff:null,h:{g:null}}}");
-
-        ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "ff");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // Child does not currently replace parent in this case. Also, the tree
-        // has 6 buckets + 1 for the this->_helper.dummyDiskLoc.
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{i:{b:{a:null},cc:{c:null},d:null,f:{e:null},h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class CantMergeRightNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{d:{b:{a:null},bb:null,cc:{c:null}},"
-            "dd:null,"
-            "_:{f:{e:null},h:{g:null}}}");
-
-        ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "bb");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{d:{b:{a:null},cc:{c:null}},"
-            "dd:null,"
-            "_:{f:{e:null},h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class CantMergeLeftNoMerge : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{c:{b:{a:null}},d:null,_:{f:{e:null},g:null}}");
-
-        ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "g");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{c:{b:{a:null}},d:null,_:{f:{e:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class MergeOption : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},_:{h:{g:null}}}");
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "ee");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{c:{b:{a:null}},_:{e:{d:null},f:null,h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class ForceMergeLeft : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{c:{b:{a:null}},f:{e:{d:null},ee:null},ff:null,_:{h:{g:null}}}");
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "ee");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{f:{b:{a:null},c:null,e:{d:null}},ff:null,_:{h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class ForceMergeRight : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{c:{b:{a:null}},cc:null,f:{e:{d:null},ee:null},_:{h:{g:null}}}");
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 7 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(8, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "ee");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{c:{b:{a:null}},cc:null,_:{e:{d:null},f:null,h:{g:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class RecursiveMerge : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},j:{i:null}}");
-
-        ASSERT_EQUALS(10, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "c");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        // Height is not currently reduced in this case
-        builder.checkStructure("{j:{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class RecursiveMergeRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},g:{f:null}},_:{i:null}}");
-
-        ASSERT_EQUALS(9, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "c");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure("{g:{b:{a:null},d:null,e:null,f:null},h:null,i:null}");
-    }
-};
-
-template <class OnDiskFormat>
-class RecursiveMergeDoubleRightBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}");
-
-        ASSERT_EQUALS(8, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON(""
-                               << "c");
-        verify(this->unindex(k));
-
-        ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        // no recursion currently in this case
-        builder.checkStructure("{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeTestBase : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    MergeSizeTestBase() : _count(0) {}
-
-    void run() {
-        OperationContextNoop opCtx;
-        this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore();
-
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        const BSONObj& topKey = biggestKey('m');
-
-        DiskLoc leftChild = this->newBucket();
-        builder.push(
-            DiskLoc::fromRecordId(this->_helper.headManager.getHead(&opCtx)), topKey, leftChild);
-        _count++;
-
-        DiskLoc rightChild = this->newBucket();
-        this->setBucketNextChild(DiskLoc::fromRecordId(this->_helper.headManager.getHead(&opCtx)),
-                                 rightChild);
-
-        _count += builder.fillBucketToExactSize(leftChild, leftSize(), 'a');
-        _count += builder.fillBucketToExactSize(rightChild, rightSize(), 'n');
-
-        ASSERT(leftAdditional() <= 2);
-        if (leftAdditional() >= 2) {
-            builder.push(leftChild, bigKey('k'), DiskLoc());
-        }
-        if (leftAdditional() >= 1) {
-            builder.push(leftChild, bigKey('l'), DiskLoc());
-        }
-
-        ASSERT(rightAdditional() <= 2);
-        if (rightAdditional() >= 2) {
-            builder.push(rightChild, bigKey('y'), DiskLoc());
-        }
-        if (rightAdditional() >= 1) {
-            builder.push(rightChild, bigKey('z'), DiskLoc());
-        }
-
-        _count += leftAdditional() + rightAdditional();
-
-        initCheck();
-
-        const char* keys = delKeys();
-        for (const char* i = keys; *i; ++i) {
-            long long unused = 0;
-            ASSERT_EQUALS(_count,
-                          this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0));
-            ASSERT_EQUALS(0, unused);
-
-            // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-            ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-            const BSONObj k = bigKey(*i);
-            this->unindex(k);
-
-            --_count;
-        }
-
-        long long unused = 0;
-        ASSERT_EQUALS(_count, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0));
-        ASSERT_EQUALS(0, unused);
-
-        validate();
-
-        if (!merge()) {
-            // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-            ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-        } else {
-            // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc
-            ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL));
-        }
-    }
-
-protected:
-    virtual int leftAdditional() const {
-        return 2;
-    }
-    virtual int rightAdditional() const {
-        return 2;
-    }
-    virtual void initCheck() {}
-    virtual void validate() {}
-    virtual int leftSize() const = 0;
-    virtual int rightSize() const = 0;
-    virtual const char* delKeys() const {
-        return "klyz";
-    }
-    virtual bool merge() const {
-        return true;
-    }
-
-    static BSONObj bigKey(char a) {
-        return simpleKey(a, 801);
-    }
-
-    static BSONObj biggestKey(char a) {
-        int size = OnDiskFormat::KeyMax - bigSize() + 801;
-        return simpleKey(a, size);
-    }
-
-    static int bigSize() {
-        return typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(bigKey('a')).dataSize();
-    }
-
-    static int biggestSize() {
-        return
-            typename BtreeLogicTestBase<OnDiskFormat>::KeyDataOwnedType(biggestKey('a')).dataSize();
-    }
-
-    int _count;
-};
-
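The subclasses that follow size the two leaves around BtreeLogic<OnDiskFormat>::lowWaterMark(), the occupancy threshold that, as these tests read, the delete path compares against before trying to coalesce a bucket with a sibling. The "just right" pair is:

    rightSize() = lowWaterMark() - 1
    leftSize()  = BucketBodySize - biggestSize() - sizeof(FixedWidthKeyType) - (lowWaterMark() - 1)

i.e. after the delKeys() deletions one leaf sits just below the mark and the two leaves plus the separator exactly fill one bucket, so they merge. The variants then step one byte in each direction: NoMergeBelowMark* lift the small leaf back to the mark (no merge), MergeSize*TooBig grow the large leaf so the combined contents no longer fit (no merge), and merge() records the expected outcome in each case.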
-template <class OnDiskFormat>
-class MergeSizeJustRightRight : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int rightSize() const {
-        return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
-    }
-
-    virtual int leftSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
-            (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeJustRightLeft : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int leftSize() const {
-        return BtreeLogic<OnDiskFormat>::lowWaterMark() - 1;
-    }
-
-    virtual int rightSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) -
-            (BtreeLogic<OnDiskFormat>::lowWaterMark() - 1);
-    }
-
-    virtual const char* delKeys() const {
-        return "yzkl";
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeRight : public MergeSizeJustRightRight<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::rightSize() - 1;
-    }
-    virtual int leftSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
-    }
-    virtual int leftSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() - 1;
-    }
-};
-
-template <class OnDiskFormat>
-class NoMergeBelowMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1;
-    }
-    virtual int leftSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::leftSize() - 1;
-    }
-    virtual bool merge() const {
-        return false;
-    }
-};
-
-template <class OnDiskFormat>
-class NoMergeBelowMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() - 1;
-    }
-    virtual int leftSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1;
-    }
-    virtual bool merge() const {
-        return false;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeRightTooBig : public MergeSizeJustRightLeft<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
-    }
-    virtual bool merge() const {
-        return false;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeSizeLeftTooBig : public MergeSizeJustRightRight<OnDiskFormat> {
-    virtual int leftSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
-    }
-    virtual bool merge() const {
-        return false;
-    }
-};
-
-template <class OnDiskFormat>
-class MergeRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int rightAdditional() const {
-        return 1;
-    }
-    virtual int leftAdditional() const {
-        return 1;
-    }
-    virtual const char* delKeys() const {
-        return "lz";
-    }
-    virtual int rightSize() const {
-        return 0;
-    }
-    virtual int leftSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeMinRightEmpty : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int rightAdditional() const {
-        return 1;
-    }
-    virtual int leftAdditional() const {
-        return 0;
-    }
-    virtual const char* delKeys() const {
-        return "z";
-    }
-    virtual int rightSize() const {
-        return 0;
-    }
-    virtual int leftSize() const {
-        return MergeSizeTestBase<OnDiskFormat>::bigSize() +
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int rightAdditional() const {
-        return 1;
-    }
-    virtual int leftAdditional() const {
-        return 1;
-    }
-    virtual const char* delKeys() const {
-        return "zl";
-    }
-    virtual int leftSize() const {
-        return 0;
-    }
-    virtual int rightSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
-    }
-};
-
-template <class OnDiskFormat>
-class MergeMinLeftEmpty : public MergeSizeTestBase<OnDiskFormat> {
-protected:
-    virtual int leftAdditional() const {
-        return 1;
-    }
-    virtual int rightAdditional() const {
-        return 0;
-    }
-    virtual const char* delKeys() const {
-        return "l";
-    }
-    virtual int leftSize() const {
-        return 0;
-    }
-    virtual int rightSize() const {
-        return MergeSizeTestBase<OnDiskFormat>::bigSize() +
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType);
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceRightEmpty : public MergeRightEmpty<OnDiskFormat> {
-protected:
-    virtual int leftSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
-    }
-
-    virtual bool merge() const {
-        return false;
-    }
-
-    virtual void initCheck() {
-        OperationContextNoop opCtx;
-        _oldTop = this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson();
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        ASSERT_BSONOBJ_NE(_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-
-private:
-    BSONObj _oldTop;
-};
-
-template <class OnDiskFormat>
-class BalanceLeftEmpty : public MergeLeftEmpty<OnDiskFormat> {
-protected:
-    virtual int rightSize() const {
-        return OnDiskFormat::BucketBodySize - MergeSizeTestBase<OnDiskFormat>::biggestSize() -
-            sizeof(typename BtreeLogicTestBase<OnDiskFormat>::FixedWidthKeyType) + 1;
-    }
-
-    virtual bool merge() const {
-        return false;
-    }
-
-    virtual void initCheck() {
-        OperationContextNoop opCtx;
-        _oldTop = this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson();
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        ASSERT_BSONOBJ_NE(_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-
-private:
-    BSONObj _oldTop;
-};
-
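The Balance* tests that follow exercise redistribution rather than merging, and the checkStructure() strings show the rotation involved: in BalanceOneLeftToRight below, deleting $40 leaves the middle leaf underfull, so the parent separator $10 rotates down into that leaf and the left leaf's highest key $6 rotates up to replace it; a single key crosses between siblings per step. The same head-key movement is what the _oldTop comparisons in the NoMoveAtLowWaterMark*/MoveBelowLowWaterMark* tests later in the file detect.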
-template <class OnDiskFormat>
-class BalanceOneLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
-            "b:{$20:null,$30:null,$40:null,$50:null,a:null},"
-            "_:{c:null}}");
-
-        ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x40, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
-            "b:{$10:null,$20:null,$30:null,$50:null,a:null},"
-            "_:{c:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceOneRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null,$2:null,$3:null,$4:null},"
-            "b:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null},"
-            "_:{c:null}}");
-
-        ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x3, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$20:{$1:null,$2:null,$4:null,$10:null},"
-            "b:{$30:null,$40:null,$50:null,$60:null,$70:null},"
-            "_:{c:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceThreeLeftToRight : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$20:{$1:{$0:null},$3:{$2:null},$5:{$4:null},$7:{$6:null},"
-            "$9:{$8:null},$11:{$10:null},$13:{$12:null},_:{$14:null}},"
-            "b:{$30:null,$40:{$35:null},$50:{$45:null}},"
-            "_:{c:null}}");
-
-        ASSERT_EQUALS(23, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x30, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 14 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(15, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$9:{$1:{$0:null},$3:{$2:null},"
-            "$5:{$4:null},$7:{$6:null},_:{$8:null}},"
-            "b:{$11:{$10:null},$13:{$12:null},$20:{$14:null},"
-            "$40:{$35:null},$50:{$45:null}},"
-            "_:{c:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceThreeRightToLeft : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$20:{$1:{$0:null},$3:{$2:null},$5:null,_:{$14:null}},"
-            "b:{$30:{$25:null},$40:{$35:null},$50:{$45:null},$60:{$55:null},"
-            "$70:{$65:null},$80:{$75:null},"
-            "$90:{$85:null},$100:{$95:null}},"
-            "_:{c:null}}");
-
-        ASSERT_EQUALS(25, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x5, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(24, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 15 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(16, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$50:{$1:{$0:null},$3:{$2:null},$20:{$14:null},"
-            "$30:{$25:null},$40:{$35:null},_:{$45:null}},"
-            "b:{$60:{$55:null},$70:{$65:null},$80:{$75:null},"
-            "$90:{$85:null},$100:{$95:null}},"
-            "_:{c:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceSingleParentKey : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
-            "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
-
-        ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x40, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
-            "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class PackEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{a:null}");
-
-        const BSONObj k = BSON(""
-                               << "a");
-        ASSERT(this->unindex(k));
-
-        this->forcePackBucket(this->_helper.headManager.getHead(&opCtx));
-
-        typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
-
-        ASSERT_EQUALS(0, headBucket->n);
-        ASSERT_FALSE(headBucket->flags & Packed);
-
-        int unused = 0;
-        this->truncateBucket(headBucket, 0, unused);
-
-        ASSERT_EQUALS(0, headBucket->n);
-        ASSERT_EQUALS(0, headBucket->topSize);
-        ASSERT_EQUALS((int)OnDiskFormat::BucketBodySize, headBucket->emptySize);
-        ASSERT_TRUE(headBucket->flags & Packed);
-    }
-};
-
-template <class OnDiskFormat>
-class PackedDataSizeEmptyBucket : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree("{a:null}");
-
-        const BSONObj k = BSON(""
-                               << "a");
-        ASSERT(this->unindex(k));
-
-        this->forcePackBucket(this->_helper.headManager.getHead(&opCtx));
-
-        typename BtreeLogicTestBase<OnDiskFormat>::BucketType* headBucket = this->head();
-
-        ASSERT_EQUALS(0, headBucket->n);
-        ASSERT_FALSE(headBucket->flags & Packed);
-        ASSERT_EQUALS(0, this->bucketPackedDataSize(headBucket, 0));
-        ASSERT_FALSE(headBucket->flags & Packed);
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceSingleParentKeyPackParent : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
-            "_:{$20:null,$30:null,$40:null,$50:null,a:null}}");
-
-        ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        // force parent pack
-        this->forcePackBucket(this->_helper.headManager.getHead(&opCtx));
-
-        const BSONObj k = BSON("" << bigNumString(0x40, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(11, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$6:{$1:null,$2:null,$3:null,$4:null,$5:null},"
-            "_:{$10:null,$20:null,$30:null,$50:null,a:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class BalanceSplitParent : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10$10:{$1:null,$2:null,$3:null,$4:null},"
-            "$100:{$20:null,$30:null,$40:null,$50:null,$60:null,$70:null,$80:null},"
-            "$200:null,$300:null,$400:null,$500:null,$600:null,"
-            "$700:null,$800:null,$900:null,_:{c:null}}");
-
-        ASSERT_EQUALS(22, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x3, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(21, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 6 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(7, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$500:{ $30:{$1:null,$2:null,$4:null,$10$10:null,$20:null},"
-            "$100:{$40:null,$50:null,$60:null,$70:null,$80:null},"
-            "$200:null,$300:null,$400:null},"
-            "_:{$600:null,$700:null,$800:null,$900:null,_:{c:null}}}");
-    }
-};
-
-template <class OnDiskFormat>
-class RebalancedSeparatorBase : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(treeSpec());
-        modTree();
-
-        ASSERT_EQUALS(
-            expectedSeparator(),
-            this->bucketRebalancedSeparatorPos(this->_helper.headManager.getHead(&opCtx), 0));
-    }
-
-    virtual string treeSpec() const = 0;
-    virtual int expectedSeparator() const = 0;
-    virtual void modTree() {}
-};
-
-template <class OnDiskFormat>
-class EvenRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$7:{$1:null,$2$31f:null,$3:null,"
-               "$4$31f:null,$5:null,$6:null},"
-               "_:{$8:null,$9:null,$10$31e:null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class EvenRebalanceLeftCusp : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$6:{$1:null,$2$31f:null,$3:null,$4$31f:null,$5:null},"
-               "_:{$7:null,$8:null,$9$31e:null,$10:null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class EvenRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$3:{$1:null,$2$31f:null},_:{$4$31f:null,$5:null,$6:null,$7:null,$8$31e:null,$9:"
-               "null,$10:null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class EvenRebalanceRightCusp : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$4$31f:{$1:null,$2$31f:null,$3:null},_:{$5:null,$6:null,$7$31e:null,$8:null,$9:"
-               "null,$10:null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class EvenRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$5:{$1:null,$2$31f:null,$3:null,$4$31f:null},_:{$6:null,$7$31e:null,$8:null,$9:"
-               "null,$10:null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class OddRebalanceLeft : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$6$31f:{$1:null,$2:null,$3:null,$4:null,$5:null},_:{$7:null,$8:null,$9:null,$10:"
-               "null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class OddRebalanceRight : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$4:{$1:null,$2:null,$3:null},_:{$5:null,$6:null,$7:null,$8$31f:null,$9:null,$10:"
-               "null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class OddRebalanceCenter : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$5:{$1:null,$2:null,$3:null,$4:null},_:{$6:null,$7:null,$8:null,$9:null,$10$31f:"
-               "null}}";
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class RebalanceEmptyRight : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$a:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null,$7:null,$8:null,$9:null},_:{$"
-               "b:null}}";
-    }
-    virtual void modTree() {
-        BSONObj k = BSON("" << bigNumString(0xb, 800));
-        ASSERT(this->unindex(k));
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class RebalanceEmptyLeft : public RebalancedSeparatorBase<OnDiskFormat> {
-    virtual string treeSpec() const {
-        return "{$a:{$1:null},_:{$11:null,$12:null,$13:null,$14:null,$15:null,$16:null,$17:null,$"
-               "18:null,$19:null}}";
-    }
-    virtual void modTree() {
-        BSONObj k = BSON("" << bigNumString(0x1, 800));
-        ASSERT(this->unindex(k));
-    }
-    virtual int expectedSeparator() const {
-        return 4;
-    }
-};
-
-template <class OnDiskFormat>
-class NoMoveAtLowWaterMarkRight : public MergeSizeJustRightRight<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::rightSize() + 1;
-    }
-
-    virtual void initCheck() {
-        OperationContextNoop opCtx;
-        _oldTop = this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson();
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        ASSERT_BSONOBJ_EQ(_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-
-    virtual bool merge() const {
-        return false;
-    }
-
-protected:
-    BSONObj _oldTop;
-};
-
-template <class OnDiskFormat>
-class MoveBelowLowWaterMarkRight : public NoMoveAtLowWaterMarkRight<OnDiskFormat> {
-    virtual int rightSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::rightSize();
-    }
-    virtual int leftSize() const {
-        return MergeSizeJustRightRight<OnDiskFormat>::leftSize() + 1;
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        // Different top means we rebalanced
-        ASSERT_BSONOBJ_NE(this->_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-};
-
-template <class OnDiskFormat>
-class NoMoveAtLowWaterMarkLeft : public MergeSizeJustRightLeft<OnDiskFormat> {
-    virtual int leftSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::leftSize() + 1;
-    }
-    virtual void initCheck() {
-        OperationContextNoop opCtx;
-        this->_oldTop = this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson();
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        ASSERT_BSONOBJ_EQ(this->_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-    virtual bool merge() const {
-        return false;
-    }
-
-protected:
-    BSONObj _oldTop;
-};
-
-template <class OnDiskFormat>
-class MoveBelowLowWaterMarkLeft : public NoMoveAtLowWaterMarkLeft<OnDiskFormat> {
-    virtual int leftSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::leftSize();
-    }
-    virtual int rightSize() const {
-        return MergeSizeJustRightLeft<OnDiskFormat>::rightSize() + 1;
-    }
-
-    virtual void validate() {
-        OperationContextNoop opCtx;
-        // Different top means we rebalanced
-        ASSERT_BSONOBJ_NE(this->_oldTop,
-                          this->getKey(this->_helper.headManager.getHead(&opCtx), 0).data.toBson());
-    }
-};
-
-template <class OnDiskFormat>
-class PreferBalanceLeft : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null,$2:null,$3:null,$4:null,$5:null,$6:null},"
-            "$20:{$11:null,$12:null,$13:null,$14:null},"
-            "_:{$30:null}}");
-
-        ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x12, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$5:{$1:null,$2:null,$3:null,$4:null},"
-            "$20:{$6:null,$10:null,$11:null,$13:null,$14:null},"
-            "_:{$30:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class PreferBalanceRight : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$1:null},"
-            "$20:{$11:null,$12:null,$13:null,$14:null},"
-            "_:{$31:null,$32:null,$33:null,$34:null,$35:null,$36:null}}");
-
-        ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        const BSONObj k = BSON("" << bigNumString(0x12, 800));
-        ASSERT(this->unindex(k));
-
-        ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0));
-
-        // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc
-        ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL));
-
-        builder.checkStructure(
-            "{$10:{$1:null},"
-            "$31:{$11:null,$13:null,$14:null,$20:null},"
-            "_:{$32:null,$33:null,$34:null,$35:null,$36:null}}");
-    }
-};
-
-template <class OnDiskFormat>
-class RecursiveMergeThenBalance : public BtreeLogicTestBase<OnDiskFormat> {
-public:
-    void run() {
-        OperationContextNoop opCtx;
-        ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper);
-
-        builder.makeTree(
-            "{$10:{$5:{$1:null,$2:null},$8:{$6:null,$7:null}},"
-            "_:{$20:null,$30:null,$40:null,$50:null,"
-            "$60:null,$70:null,$80:null,$90:null}}");
-
false, 0)); - - // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL)); - - const BSONObj k = BSON("" << bigNumString(0x7, 800)); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(14, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - - builder.checkStructure( - "{$40:{$8:{$1:null,$2:null,$5:null,$6:null},$10:null,$20:null,$30:null}," - "_:{$50:null,$60:null,$70:null,$80:null,$90:null}}"); - } -}; - -template <class OnDiskFormat> -class DelEmptyNoNeighbors : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{b:{a:null}}"); - - ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0)); - - // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - - const BSONObj k = BSON("" - << "a"); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0)); - - // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - - builder.checkStructure("{b:null}"); - } -}; - -template <class OnDiskFormat> -class DelEmptyEmptyNeighbors : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,c:{b:null},d:null}"); - - ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0)); - - // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - - const BSONObj k = BSON("" - << "b"); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, NULL, true, false, 0)); - - // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - - builder.checkStructure("{a:null,c:null,d:null}"); - } -}; - -template <class OnDiskFormat> -class DelInternal : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,c:{b:null},d:null}"); - - long long unused = 0; - ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "c"); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure("{a:null,b:null,d:null}"); - } -}; - -template <class OnDiskFormat> -class DelInternalReplaceWithUnused : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,c:{b:null},d:null}"); - - const DiskLoc prevChildBucket = - 
this->getKey(this->_helper.headManager.getHead(&opCtx), 1).prevChildBucket; - this->markKeyUnused(prevChildBucket, 0); - - long long unused = 0; - ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(1, unused); - - const BSONObj k = BSON("" - << "c"); - ASSERT(this->unindex(k)); - - unused = 0; - ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(1, unused); - - // doesn't discriminate between used and unused - builder.checkStructure("{a:null,b:null,d:null}"); - } -}; - -template <class OnDiskFormat> -class DelInternalReplaceRight : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,_:{b:null}}"); - - long long unused = 0; - ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 2 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "a"); - ASSERT(this->unindex(k)); - - unused = 0; - ASSERT_EQUALS(1, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 1 bucket + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure("{b:null}"); - } -}; - -template <class OnDiskFormat> -class DelInternalPromoteKey : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,y:{d:{c:{b:null}},_:{e:null}},z:null}"); - - long long unused = 0; - ASSERT_EQUALS(7, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 5 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(6, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "y"); - ASSERT(this->unindex(k)); - - unused = 0; - ASSERT_EQUALS(6, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure("{a:null,e:{c:{b:null},d:null},z:null}"); - } -}; - -template <class OnDiskFormat> -class DelInternalPromoteRightKey : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,_:{e:{c:null},_:{f:null}}}"); - - long long unused = 0; - ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "a"); - ASSERT(this->unindex(k)); - - unused = 0; - ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 2 buckets + 1 for the 
this->_helper.dummyDiskLoc - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure("{c:null,_:{e:null,f:null}}"); - } -}; - -template <class OnDiskFormat> -class DelInternalReplacementPrevNonNull : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,d:{c:{b:null}},e:null}"); - - long long unused = 0; - ASSERT_EQUALS(5, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "d"); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(4, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(1, unused); - - builder.checkStructure("{a:null,d:{c:{b:null}},e:null}"); - - // Check 'unused' key - ASSERT(this->getKey(this->_helper.headManager.getHead(&opCtx), 1).recordLoc.getOfs() & 1); - } -}; - -template <class OnDiskFormat> -class DelInternalReplacementNextNonNull : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree("{a:null,_:{c:null,_:{d:null}}}"); - - long long unused = 0; - ASSERT_EQUALS(3, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" - << "a"); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(2, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 3 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(4, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(1, unused); - - builder.checkStructure("{a:null,_:{c:null,_:{d:null}}}"); - - // Check 'unused' key - ASSERT(this->getKey(this->_helper.headManager.getHead(&opCtx), 0).recordLoc.getOfs() & 1); - } -}; - -template <class OnDiskFormat> -class DelInternalSplitPromoteLeft : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree( - "{$10:null,$20:null,$30$10:{$25:{$23:null},_:{$27:null}}," - "$40:null,$50:null,$60:null,$70:null,$80:null,$90:null,$100:null}"); - - long long unused = 0; - ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" << bigNumString(0x30, 0x10)); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure( - "{$60:{$10:null,$20:null," - "$27:{$23:null,$25:null},$40:null,$50:null}," - "_:{$70:null,$80:null,$90:null,$100:null}}"); - } -}; - -template <class OnDiskFormat> -class 
DelInternalSplitPromoteRight : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - ArtificialTreeBuilder<OnDiskFormat> builder(&opCtx, &this->_helper); - - builder.makeTree( - "{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null," - "$80:null,$90:null,$100$10:{$95:{$93:null},_:{$97:null}}}"); - - long long unused = 0; - ASSERT_EQUALS(13, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - const BSONObj k = BSON("" << bigNumString(0x100, 0x10)); - ASSERT(this->unindex(k)); - - ASSERT_EQUALS(12, this->_helper.btree.fullValidate(&opCtx, &unused, true, false, 0)); - - // The tree has 4 buckets + 1 for the this->_helper.dummyDiskLoc - ASSERT_EQUALS(5, this->_helper.recordStore.numRecords(NULL)); - ASSERT_EQUALS(0, unused); - - builder.checkStructure( - "{$80:{$10:null,$20:null,$30:null,$40:null,$50:null,$60:null,$70:null}," - "_:{$90:null,$97:{$93:null,$95:null}}}"); - } -}; - -template <class OnDiskFormat> -class LocateEmptyForward : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore(); - - BSONObj key1 = simpleKey('a'); - this->insert(key1, this->_helper.dummyDiskLoc).transitional_ignore(); - BSONObj key2 = simpleKey('b'); - this->insert(key2, this->_helper.dummyDiskLoc).transitional_ignore(); - BSONObj key3 = simpleKey('c'); - this->insert(key3, this->_helper.dummyDiskLoc).transitional_ignore(); - - this->checkValidNumKeys(3); - this->locate(BSONObj(), 0, false, this->_helper.headManager.getHead(&opCtx), 1); - } -}; - -template <class OnDiskFormat> -class LocateEmptyReverse : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore(); - - BSONObj key1 = simpleKey('a'); - this->insert(key1, this->_helper.dummyDiskLoc).transitional_ignore(); - BSONObj key2 = simpleKey('b'); - this->insert(key2, this->_helper.dummyDiskLoc).transitional_ignore(); - BSONObj key3 = simpleKey('c'); - this->insert(key3, this->_helper.dummyDiskLoc).transitional_ignore(); - - this->checkValidNumKeys(3); - this->locate(BSONObj(), -1, false, DiskLoc(), -1); - } -}; - -template <class OnDiskFormat> -class DuplicateKeys : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - OperationContextNoop opCtx; - this->_helper.btree.initAsEmpty(&opCtx).transitional_ignore(); - - BSONObj key1 = simpleKey('z'); - ASSERT_OK(this->insert(key1, this->_helper.dummyDiskLoc, true)); - this->checkValidNumKeys(1); - this->locate(key1, 0, true, this->_helper.headManager.getHead(&opCtx), 1); - - // Attempt to insert a dup key/value, which is okay. - ASSERT_EQUALS(Status::OK(), this->insert(key1, this->_helper.dummyDiskLoc, true)); - this->checkValidNumKeys(1); - this->locate(key1, 0, true, this->_helper.headManager.getHead(&opCtx), 1); - - // Attempt to insert a dup key/value with dupsAllowed=false. - ASSERT_EQUALS(ErrorCodes::DuplicateKeyValue, - this->insert(key1, this->_helper.dummyDiskLoc, false)); - this->checkValidNumKeys(1); - this->locate(key1, 0, true, this->_helper.headManager.getHead(&opCtx), 1); - - // Add another record to produce another diskloc. 
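// (Inferred from the assertions below, not a comment in the original source: the
// btree keys on the (key, recordLoc) pair, so a second record location is what
// separates ErrorCodes::DuplicateKeyValue, reinserting the exact same pair,
// from ErrorCodes::DuplicateKey, the same key pointing at a different record.)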
- StatusWith<RecordId> s = - this->_helper.recordStore.insertRecord(&opCtx, "a", 1, Timestamp(), false); - - ASSERT_TRUE(s.isOK()); - ASSERT_EQUALS(3, this->_helper.recordStore.numRecords(NULL)); - - const DiskLoc dummyDiskLoc2 = DiskLoc::fromRecordId(s.getValue()); - - // Attempt to insert a dup key but this time with a different value. - ASSERT_EQUALS(ErrorCodes::DuplicateKey, this->insert(key1, dummyDiskLoc2, false)); - this->checkValidNumKeys(1); - - // Insert a dup key with dupsAllowed=true, should succeed. - ASSERT_OK(this->insert(key1, dummyDiskLoc2, true)); - this->checkValidNumKeys(2); - - // Clean up. - this->_helper.recordStore.deleteRecord(&opCtx, s.getValue()); - ASSERT_EQUALS(2, this->_helper.recordStore.numRecords(NULL)); - } -}; - - -/* This test requires the entire server to be linked-in and it is better implemented using - the JS framework. Disabling here and will put in jsCore. - -template<class OnDiskFormat> -class SignedZeroDuplication : public BtreeLogicTestBase<OnDiskFormat> { -public: - void run() { - ASSERT_EQUALS(0.0, -0.0); - DBDirectClient c; - - static const string ns("unittests.SignedZeroDuplication"); - - c.ensureIndex(ns, BSON("b" << 1), true); - c.insert(ns, BSON("b" << 0.0)); - c.insert(ns, BSON("b" << 1.0)); - c.update(ns, BSON("b" << 1.0), BSON("b" << -0.0)); - - ASSERT_EQUALS(1U, c.count(ns, BSON("b" << 0.0))); - } -}; -*/ - -/* -// QUERY_MIGRATION: port later - class PackUnused : public Base { - public: - void run() { - for ( long long i = 0; i < 1000000; i += 1000 ) { - insert( i ); - } - string orig, after; - { - stringstream ss; - bt()->shape( ss ); - orig = ss.str(); - } - vector< string > toDel; - vector< string > other; - BSONObjBuilder start; - start.appendMinKey( "a" ); - BSONObjBuilder end; - end.appendMaxKey( "a" ); - unique_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), - id(), - start.done(), - end.done(), - false, - 1 ) ); - while( c->ok() ) { - bool has_child = - c->getBucket().btree()->keyNode(c->getKeyOfs()).prevChildBucket.isNull(); - - if (has_child) { - toDel.push_back( c->currKey().firstElement().valuestr() ); - } - else { - other.push_back( c->currKey().firstElement().valuestr() ); - } - c->advance(); - } - ASSERT( toDel.size() > 0 ); - for( vector< string >::const_iterator i = toDel.begin(); i != toDel.end(); ++i ) { - BSONObj o = BSON( "a" << *i ); - this->unindex( o ); - } - ASSERT( other.size() > 0 ); - for( vector< string >::const_iterator i = other.begin(); i != other.end(); ++i ) { - BSONObj o = BSON( "a" << *i ); - this->unindex( o ); - } - - long long unused = 0; - ASSERT_EQUALS( 0, bt()->fullValidate(&opCtx, dl(), order(), &unused, true ) ); - - for ( long long i = 50000; i < 50100; ++i ) { - insert( i ); - } - - long long unused2 = 0; - ASSERT_EQUALS( 100, bt()->fullValidate(&opCtx, dl(), order(), &unused2, true ) ); - -// log() << "old unused: " << unused << ", new unused: " << unused2 << endl; -// - ASSERT( unused2 <= unused ); - } - protected: - void insert( long long n ) { - string val = bigNumString( n ); - BSONObj k = BSON( "a" << val ); - Base::insert( k ); - } - }; - - class DontDropReferenceKey : public PackUnused { - public: - void run() { - // with 80 root node is full - for ( long long i = 0; i < 80; i += 1 ) { - insert( i ); - } - - BSONObjBuilder start; - start.appendMinKey( "a" ); - BSONObjBuilder end; - end.appendMaxKey( "a" ); - BSONObj l = bt()->keyNode( 0 ).key.toBson(); - string toInsert; - unique_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), - id(), - start.done(), - 
end.done(), - false, - 1 ) ); - while( c->ok() ) { - if ( c->currKey().woCompare( l ) > 0 ) { - toInsert = c->currKey().firstElement().valuestr(); - break; - } - c->advance(); - } - // too much work to try to make this happen through inserts and deletes - // we are intentionally manipulating the btree bucket directly here - BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( - &bt()->keyNode( 1 ).prevChildBucket ); - writing(L)->Null(); - writingInt( const_cast< BtreeBucket::Loc& >( - bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused - BSONObj k = BSON( "a" << toInsert ); - Base::insert( k ); - } - }; - */ - -// -// TEST SUITE DEFINITION -// - -template <class OnDiskFormat> -class BtreeLogicTestSuite : public unittest::Suite { -public: - BtreeLogicTestSuite(const std::string& name) : Suite(name) {} - - void setupTests() { - add<SimpleCreate<OnDiskFormat>>(); - add<SimpleInsertDelete<OnDiskFormat>>(); - add<SplitRightHeavyBucket<OnDiskFormat>>(); - add<SplitLeftHeavyBucket<OnDiskFormat>>(); - add<MissingLocate<OnDiskFormat>>(); - add<MissingLocateMultiBucket<OnDiskFormat>>(); - add<SERVER983<OnDiskFormat>>(); - add<DontReuseUnused<OnDiskFormat>>(); - add<MergeBucketsLeft<OnDiskFormat>>(); - add<MergeBucketsRight<OnDiskFormat>>(); - add<MergeBucketsDontReplaceHead<OnDiskFormat>>(); - add<MergeBucketsDelInternal<OnDiskFormat>>(); - add<MergeBucketsRightNull<OnDiskFormat>>(); - add<DontMergeSingleBucket<OnDiskFormat>>(); - add<ParentMergeNonRightToLeft<OnDiskFormat>>(); - add<ParentMergeNonRightToRight<OnDiskFormat>>(); - add<CantMergeRightNoMerge<OnDiskFormat>>(); - add<CantMergeLeftNoMerge<OnDiskFormat>>(); - add<MergeOption<OnDiskFormat>>(); - add<ForceMergeLeft<OnDiskFormat>>(); - add<ForceMergeRight<OnDiskFormat>>(); - add<RecursiveMerge<OnDiskFormat>>(); - add<RecursiveMergeRightBucket<OnDiskFormat>>(); - add<RecursiveMergeDoubleRightBucket<OnDiskFormat>>(); - - add<MergeSizeJustRightRight<OnDiskFormat>>(); - add<MergeSizeJustRightLeft<OnDiskFormat>>(); - add<MergeSizeRight<OnDiskFormat>>(); - add<MergeSizeLeft<OnDiskFormat>>(); - add<NoMergeBelowMarkRight<OnDiskFormat>>(); - add<NoMergeBelowMarkLeft<OnDiskFormat>>(); - add<MergeSizeRightTooBig<OnDiskFormat>>(); - add<MergeSizeLeftTooBig<OnDiskFormat>>(); - add<MergeRightEmpty<OnDiskFormat>>(); - add<MergeMinRightEmpty<OnDiskFormat>>(); - add<MergeLeftEmpty<OnDiskFormat>>(); - add<MergeMinLeftEmpty<OnDiskFormat>>(); - add<BalanceRightEmpty<OnDiskFormat>>(); - add<BalanceLeftEmpty<OnDiskFormat>>(); - - add<BalanceOneLeftToRight<OnDiskFormat>>(); - add<BalanceOneRightToLeft<OnDiskFormat>>(); - add<BalanceThreeLeftToRight<OnDiskFormat>>(); - add<BalanceThreeRightToLeft<OnDiskFormat>>(); - add<BalanceSingleParentKey<OnDiskFormat>>(); - - add<PackEmptyBucket<OnDiskFormat>>(); - add<PackedDataSizeEmptyBucket<OnDiskFormat>>(); - - add<BalanceSingleParentKeyPackParent<OnDiskFormat>>(); - add<BalanceSplitParent<OnDiskFormat>>(); - add<EvenRebalanceLeft<OnDiskFormat>>(); - add<EvenRebalanceLeftCusp<OnDiskFormat>>(); - add<EvenRebalanceRight<OnDiskFormat>>(); - add<EvenRebalanceRightCusp<OnDiskFormat>>(); - add<EvenRebalanceCenter<OnDiskFormat>>(); - add<OddRebalanceLeft<OnDiskFormat>>(); - add<OddRebalanceRight<OnDiskFormat>>(); - add<OddRebalanceCenter<OnDiskFormat>>(); - add<RebalanceEmptyRight<OnDiskFormat>>(); - add<RebalanceEmptyLeft<OnDiskFormat>>(); - - add<NoMoveAtLowWaterMarkRight<OnDiskFormat>>(); - add<MoveBelowLowWaterMarkRight<OnDiskFormat>>(); - add<NoMoveAtLowWaterMarkLeft<OnDiskFormat>>(); - 
add<MoveBelowLowWaterMarkLeft<OnDiskFormat>>(); - - add<PreferBalanceLeft<OnDiskFormat>>(); - add<PreferBalanceRight<OnDiskFormat>>(); - add<RecursiveMergeThenBalance<OnDiskFormat>>(); - add<DelEmptyNoNeighbors<OnDiskFormat>>(); - add<DelEmptyEmptyNeighbors<OnDiskFormat>>(); - add<DelInternal<OnDiskFormat>>(); - add<DelInternalReplaceWithUnused<OnDiskFormat>>(); - add<DelInternalReplaceRight<OnDiskFormat>>(); - add<DelInternalPromoteKey<OnDiskFormat>>(); - add<DelInternalPromoteRightKey<OnDiskFormat>>(); - add<DelInternalReplacementPrevNonNull<OnDiskFormat>>(); - add<DelInternalReplacementNextNonNull<OnDiskFormat>>(); - add<DelInternalSplitPromoteLeft<OnDiskFormat>>(); - add<DelInternalSplitPromoteRight<OnDiskFormat>>(); - - add<LocateEmptyForward<OnDiskFormat>>(); - add<LocateEmptyReverse<OnDiskFormat>>(); - - add<DuplicateKeys<OnDiskFormat>>(); - } -}; - -// Test suite for both V0 and V1 -static unittest::SuiteInstance<BtreeLogicTestSuite<BtreeLayoutV0>> SUITE_V0("BTreeLogicTests_V0"); - -static unittest::SuiteInstance<BtreeLogicTestSuite<BtreeLayoutV1>> SUITE_V1("BTreeLogicTests_V1"); -} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp deleted file mode 100644 index 91b7141e7ed..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/btree/btree_ondisk.h" - -#include "mongo/util/assert_util.h" -#include "mongo/util/log.h" - -namespace mongo { - -void DiskLoc56Bit::operator=(const DiskLoc& loc) { - ofs = loc.getOfs(); - int la = loc.a(); - if (la == DiskLoc::max().a()) { - invariant(ofs == DiskLoc::max().getOfs()); - la = OurMaxA; - } - invariant(la <= OurMaxA); // must fit in 3 bytes - if (la < 0) { - if (la != -1) { - log() << "btree diskloc isn't negative 1: " << la << std::endl; - invariant(la == -1); - } - la = 0; - ofs = OurNullOfs; - } - memcpy(_a, &la, 3); // endian -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h b/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h deleted file mode 100644 index 9be2f947772..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_ondisk.h +++ /dev/null @@ -1,377 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/static_assert.h" -#include "mongo/db/jsobj.h" -#include "mongo/db/storage/mmap_v1/btree/key.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -const int OldBucketSize = 8192; - -// -// On-disk index format -// - -#pragma pack(1) -/** - * This is the fixed width data component for storage of a key within a bucket. It contains an - * offset pointer to the variable width bson data component. This may be 'unused', please see - * below. - * - * Why is this templated on Loc? Because V0 and V1 have different size DiskLoc(s) but otherwise - * the same layout. - */ -template <class LocType> -struct FixedWidthKey { - // - // Data - // - - /** - * The 'left' child bucket of this key. If this is the i-th key, it points to the i index - * child bucket. - */ - LocType prevChildBucket; - - /** - * The location of the record associated with this key. - */ - LocType recordLoc; - - /** - * Offset within current bucket of the variable width bson key for this _KeyNode. 
- */ - unsigned short _kdo; - - // - // Accessors / mutators - // - - short keyDataOfs() const { - return static_cast<short>(_kdo); - } - - void setKeyDataOfs(short s) { - _kdo = s; - invariant(s >= 0); - } - - void setKeyDataOfsSavingUse(short s) { - // XXX kill this func - setKeyDataOfs(s); - } - - /** - * Unused keys are not returned by read operations. Keys may be marked - * as unused in cases where it is difficult to delete them while - * maintaining the constraints required of a btree. - * - * Setting ofs to odd is the sentinel for unused, as real recordLoc's - * are always even numbers. Note we need to keep its value basically - * the same as we use the recordLoc as part of the key in the index - * (to handle duplicate keys efficiently). - * - * Flagging keys as unused is a feature that is being phased out in favor - * of deleting the keys outright. The current btree implementation is - * not expected to mark a key as unused in a non legacy btree. - */ - void setUnused() { - recordLoc.GETOFS() |= 1; - } - - void setUsed() { - recordLoc.GETOFS() &= ~1; - } - - int isUnused() const { - return recordLoc.getOfs() & 1; - } - - int isUsed() const { - return !isUnused(); - } -}; - -/** - * This structure represents header data for a btree bucket. An object of - * this type is typically allocated inside of a buffer of size BucketSize, - * resulting in a full bucket with an appropriate header. - * - * The body of a btree bucket contains an array of _KeyNode objects starting - * from its lowest indexed bytes and growing to higher indexed bytes. The - * body also contains variable width bson keys, which are allocated from the - * highest indexed bytes toward lower indexed bytes. - * - * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb| - * h = header data - * k = KeyNode data - * - = empty space - * b = bson key data - * u = unused (old) bson key data, that may be garbage collected - */ -struct BtreeBucketV0 { - /** - * Parent bucket of this bucket, which isNull() for the root bucket. - */ - DiskLoc parent; - - /** - * Given that there are n keys, this is the n index child. - */ - DiskLoc nextChild; - - /** - * Can be reused, value is 8192 in current pdfile version Apr2010 - */ - unsigned short _wasSize; - - /** - * zero - */ - unsigned short _reserved1; - - int flags; - - /** basicInsert() assumes the next three members are consecutive and in this order: */ - - /** Size of the empty region. */ - int emptySize; - - /** Size used for bson storage, including storage of old keys. */ - int topSize; - - /* Number of keys in the bucket. */ - int n; - - int reserved; - - /* Beginning of the bucket's body */ - char data[4]; - - // Precalculated size constants - enum { HeaderSize = 40 }; -}; - -// BtreeBucketV0 is part of the on-disk format, so it should never be changed -MONGO_STATIC_ASSERT(sizeof(BtreeBucketV0) - sizeof(static_cast<BtreeBucketV0*>(NULL)->data) == - BtreeBucketV0::HeaderSize); - -/** - * A variant of DiskLoc Used by the V1 bucket type. 
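 * As the fields below imply (this gloss is the editor's, not the original
 * author's), it packs a 4-byte offset with a 3-byte file number, one byte less
 * than a full DiskLoc; because each FixedWidthKey holds two locations
 * (prevChildBucket and recordLoc), the saving applies twice per key header,
 * and OurMaxA = 0xffffff is the largest file number it can address.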
- */ -struct DiskLoc56Bit { - // - // Data - // - - int ofs; - - unsigned char _a[3]; - - // - // Accessors XXX rename these, this is terrible - // - - int& GETOFS() { - return ofs; - } - - int getOfs() const { - return ofs; - } - - // - // Comparison - // - - bool isNull() const { - return ofs < 0; - } - - unsigned long long toLongLong() const { - // endian - unsigned long long result = ofs; - char* cursor = reinterpret_cast<char*>(&result); - *reinterpret_cast<uint16_t*>(cursor + 4) = *reinterpret_cast<const uint16_t*>(&_a[0]); - *reinterpret_cast<uint8_t*>(cursor + 6) = *reinterpret_cast<const uint8_t*>(&_a[2]); - *reinterpret_cast<uint8_t*>(cursor + 7) = uint8_t(0); - return result; - } - - bool operator<(const DiskLoc56Bit& rhs) const { - // the ordering of dup keys in btrees isn't too critical, but we'd like to put items - // that are close together on disk close together in the tree, so we do want the file # - // to be the most significant bytes - return toLongLong() < rhs.toLongLong(); - } - - int compare(const DiskLoc56Bit& rhs) const { - unsigned long long a = toLongLong(); - unsigned long long b = rhs.toLongLong(); - if (a < b) { - return -1; - } else { - return a == b ? 0 : 1; - } - } - - bool operator==(const DiskLoc56Bit& rhs) const { - return toLongLong() == rhs.toLongLong(); - } - - bool operator!=(const DiskLoc56Bit& rhs) const { - return toLongLong() != rhs.toLongLong(); - } - - bool operator==(const DiskLoc& rhs) const { - return DiskLoc(*this) == rhs; - } - - bool operator!=(const DiskLoc& rhs) const { - return !(*this == rhs); - } - - // - // Mutation - // - - enum { - OurNullOfs = -2, // first bit of offsets used in _KeyNode we don't use -1 here - OurMaxA = 0xffffff, // highest 3-byte value - }; - - void Null() { - ofs = OurNullOfs; - _a[0] = _a[1] = _a[2] = 0; - } - - void operator=(const DiskLoc& loc); - - // - // Type Conversion - // - - RecordId toRecordId() const { - return DiskLoc(*this).toRecordId(); - } - - operator DiskLoc() const { - // endian - if (isNull()) - return DiskLoc(); - unsigned a = *((unsigned*)(_a - 1)); - return DiskLoc(a >> 8, ofs); - } - - std::string toString() const { - return DiskLoc(*this).toString(); - } -}; - -struct BtreeBucketV1 { - /** Parent bucket of this bucket, which isNull() for the root bucket. */ - DiskLoc56Bit parent; - - /** Given that there are n keys, this is the n index child. */ - DiskLoc56Bit nextChild; - - unsigned short flags; - - /** Size of the empty region. */ - unsigned short emptySize; - - /** Size used for bson storage, including storage of old keys. */ - unsigned short topSize; - - /* Number of keys in the bucket. */ - unsigned short n; - - /* Beginning of the bucket's body */ - char data[4]; - - // Precalculated size constants - enum { HeaderSize = 22 }; -}; - -// BtreeBucketV1 is part of the on-disk format, so it should never be changed -MONGO_STATIC_ASSERT(sizeof(BtreeBucketV1) - sizeof(static_cast<BtreeBucketV1*>(NULL)->data) == - BtreeBucketV1::HeaderSize); - -enum Flags { Packed = 1 }; - -struct BtreeLayoutV0 { - typedef FixedWidthKey<DiskLoc> FixedWidthKeyType; - typedef DiskLoc LocType; - typedef KeyBson KeyType; - typedef KeyBson KeyOwnedType; - typedef BtreeBucketV0 BucketType; - - enum { BucketSize = 8192, BucketBodySize = BucketSize - BucketType::HeaderSize }; - - // largest key size we allow. note we very much need to support bigger keys (somehow) in - // the future. - - static const int KeyMax = OldBucketSize / 10; - - // A sentinel value sometimes used to identify a deallocated bucket. 
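    // (-1 is available here because the V0 bucket stores its key count 'n' as
    // a signed int; the V1 layout below has to use 0xffff instead, since its
    // 'n' field is an unsigned short.)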
- static const int INVALID_N_SENTINEL = -1; - - static void initBucket(BucketType* bucket) { - bucket->_reserved1 = 0; - bucket->_wasSize = BucketSize; - bucket->reserved = 0; - } -}; - -struct BtreeLayoutV1 { - typedef FixedWidthKey<DiskLoc56Bit> FixedWidthKeyType; - typedef KeyV1 KeyType; - typedef KeyV1Owned KeyOwnedType; - typedef DiskLoc56Bit LocType; - typedef BtreeBucketV1 BucketType; - - enum { - BucketSize = 8192 - 16, // The -16 is to leave room for the MmapV1RecordHeader header - BucketBodySize = BucketSize - BucketType::HeaderSize - }; - - static const int KeyMax = 1024; - - // A sentinel value sometimes used to identify a deallocated bucket. - static const unsigned short INVALID_N_SENTINEL = 0xffff; - - static void initBucket(BucketType* bucket) {} -}; - -#pragma pack() - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp deleted file mode 100644 index 6e5bce9b553..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.cpp +++ /dev/null @@ -1,244 +0,0 @@ -// btree_test_help.cpp : Helper functions for Btree unit-testing -// - -/** - * Copyright (C) 2014 MongoDB - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. 
- */ - -#include "mongo/db/storage/mmap_v1/btree/btree_test_help.h" - -#include "mongo/db/operation_context_noop.h" -#include "mongo/unittest/unittest.h" - - -namespace mongo { - -using std::string; - -string bigNumString(long long n, int len) { - char sub[17]; - sprintf(sub, "%.16llx", n); - string val(len, ' '); - for (int i = 0; i < len; ++i) { - val[i] = sub[i % 16]; - } - return val; -} - -BSONObj simpleKey(char c, int n) { - BSONObjBuilder builder; - string val(n, c); - builder.append("a", val); - return builder.obj(); -} - -// -// BtreeLogicTestHelper -// - -template <class OnDiskFormat> -BtreeLogicTestHelper<OnDiskFormat>::BtreeLogicTestHelper(const BSONObj& order) - : recordStore("TestRecordStore"), - btree(&headManager, - &recordStore, - &cursorRegistry, - Ordering::make(order), - "TestIndex", - /*isUnique*/ false) { - static const string randomData("RandomStuff"); - - // Generate a valid record location for a "fake" record, which we will repeatedly use - // throughout the tests. - OperationContextNoop opCtx; - StatusWith<RecordId> s = recordStore.insertRecord( - &opCtx, randomData.c_str(), randomData.length(), Timestamp(), false); - - ASSERT_TRUE(s.isOK()); - ASSERT_EQUALS(1, recordStore.numRecords(NULL)); - - dummyDiskLoc = DiskLoc::fromRecordId(s.getValue()); -} - - -// -// ArtificialTreeBuilder -// - -template <class OnDiskFormat> -void ArtificialTreeBuilder<OnDiskFormat>::makeTree(const string& spec) { - _helper->headManager.setHead(_opCtx, makeTree(fromjson(spec)).toRecordId()); -} - -template <class OnDiskFormat> -DiskLoc ArtificialTreeBuilder<OnDiskFormat>::makeTree(const BSONObj& spec) { - DiskLoc bucketLoc = _helper->btree._addBucket(_opCtx); - BucketType* bucket = _helper->btree.getBucket(_opCtx, bucketLoc); - - BSONObjIterator i(spec); - while (i.more()) { - BSONElement e = i.next(); - DiskLoc child; - if (e.type() == Object) { - child = makeTree(e.embeddedObject()); - } - - if (e.fieldName() == string("_")) { - bucket->nextChild = child; - } else { - KeyDataOwnedType key(BSON("" << expectedKey(e.fieldName()))); - invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, key, child)); - } - } - - _helper->btree.fixParentPtrs(_opCtx, bucket, bucketLoc); - return bucketLoc; -} - -template <class OnDiskFormat> -void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const string& spec) const { - checkStructure(fromjson(spec), DiskLoc::fromRecordId(_helper->headManager.getHead(_opCtx))); -} - -template <class OnDiskFormat> -void ArtificialTreeBuilder<OnDiskFormat>::push(const DiskLoc bucketLoc, - const BSONObj& key, - const DiskLoc child) { - KeyDataOwnedType k(key); - BucketType* bucket = _helper->btree.getBucket(_opCtx, bucketLoc); - - invariant(_helper->btree.pushBack(bucket, _helper->dummyDiskLoc, k, child)); - _helper->btree.fixParentPtrs(_opCtx, bucket, bucketLoc); -} - -template <class OnDiskFormat> -void ArtificialTreeBuilder<OnDiskFormat>::checkStructure(const BSONObj& spec, - const DiskLoc node) const { - BucketType* bucket = _helper->btree.getBucket(_opCtx, node); - - BSONObjIterator j(spec); - for (int i = 0; i < bucket->n; ++i) { - ASSERT(j.more()); - BSONElement e = j.next(); - KeyHeaderType kn = BtreeLogic<OnDiskFormat>::getKeyHeader(bucket, i); - string expected = expectedKey(e.fieldName()); - ASSERT(isPresent(BSON("" << expected), 1)); - ASSERT(isPresent(BSON("" << expected), -1)); - - // ASSERT_EQUALS(expected, kn.key.toBson().firstElement().valuestr()); - if (kn.prevChildBucket.isNull()) { - ASSERT(e.type() == jstNULL); - } else { - 
ASSERT(e.type() == Object); - checkStructure(e.embeddedObject(), kn.prevChildBucket); - } - } - if (bucket->nextChild.isNull()) { - // maybe should allow '_' field with null value? - ASSERT(!j.more()); - } else { - BSONElement e = j.next(); - ASSERT_EQUALS(string("_"), e.fieldName()); - ASSERT(e.type() == Object); - checkStructure(e.embeddedObject(), bucket->nextChild); - } - ASSERT(!j.more()); -} - -template <class OnDiskFormat> -bool ArtificialTreeBuilder<OnDiskFormat>::isPresent(const BSONObj& key, int direction) const { - int pos; - DiskLoc loc; - OperationContextNoop opCtx; - return _helper->btree.locate(&opCtx, key, _helper->dummyDiskLoc, direction, &pos, &loc); -} - -// Static -template <class OnDiskFormat> -string ArtificialTreeBuilder<OnDiskFormat>::expectedKey(const char* spec) { - if (spec[0] != '$') { - return spec; - } - char* endPtr; - - // parsing a long long is a pain, so just allow shorter keys for now - unsigned long long num = strtol(spec + 1, &endPtr, 16); - int len = 800; - if (*endPtr == '$') { - len = strtol(endPtr + 1, 0, 16); - } - - return bigNumString(num, len); -} - -template <class OnDiskFormat> -int ArtificialTreeBuilder<OnDiskFormat>::fillBucketToExactSize(const DiskLoc bucketLoc, - int targetSize, - char startKey) { - ASSERT_FALSE(bucketLoc.isNull()); - - BucketType* bucket = _helper->btree.getBucket(_opCtx, bucketLoc); - ASSERT_EQUALS(0, bucket->n); - - static const int bigSize = KeyDataOwnedType(simpleKey('a', 801)).dataSize(); - - int size = 0; - int keyCount = 0; - while (size < targetSize) { - int space = targetSize - size; - int nextSize = space - sizeof(FixedWidthKeyType); - verify(nextSize > 0); - - BSONObj newKey; - if (nextSize >= bigSize) { - newKey = simpleKey(startKey++, 801); - } else { - newKey = simpleKey(startKey++, nextSize - (bigSize - 801)); - } - - push(bucketLoc, newKey, DiskLoc()); - - size += KeyDataOwnedType(newKey).dataSize() + sizeof(FixedWidthKeyType); - keyCount += 1; - } - - ASSERT_EQUALS(_helper->btree._packedDataSize(bucket, 0), targetSize); - - return keyCount; -} - -// -// This causes actual code to be generated for the usages of the templates in this file. -// - -// V0 format. -template struct BtreeLogicTestHelper<BtreeLayoutV0>; -template class ArtificialTreeBuilder<BtreeLayoutV0>; - -// V1 format. -template struct BtreeLogicTestHelper<BtreeLayoutV1>; -template class ArtificialTreeBuilder<BtreeLayoutV1>; -} diff --git a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h b/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h deleted file mode 100644 index c5d48b48b3a..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/btree_test_help.h +++ /dev/null @@ -1,150 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <string> - -#include "mongo/db/json.h" -#include "mongo/db/storage/mmap_v1/btree/btree_logic.h" -#include "mongo/db/storage/mmap_v1/heap_record_store_btree.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" - -namespace mongo { - -/** - * Generates a string of the specified length containing repeated concatenation of the - * hexadecimal representation of the input value. - */ -std::string bigNumString(long long n, int len); - -/** - * Generates key on a field 'a', with the specified number of repetitions of the character. - */ -BSONObj simpleKey(char c, int n = 1); - -/** - * Simple head manager, which performs no validity checking or persistence. - */ -class TestHeadManager : public HeadManager { -public: - virtual const RecordId getHead(OperationContext* opCtx) const { - return _head; - } - - virtual void setHead(OperationContext* opCtx, const RecordId newHead) { - _head = newHead; - } - -private: - RecordId _head; -}; - - -/** - * This structure encapsulates a Btree and all the infrastructure needed by it (head manager, - * record store and a valid disk location to use by the tests). - */ -template <class OnDiskFormat> -struct BtreeLogicTestHelper { - BtreeLogicTestHelper(const BSONObj& order); - - // Everything needed for a fully-functional Btree logic - TestHeadManager headManager; - HeapRecordStoreBtree recordStore; - SavedCursorRegistry cursorRegistry; - BtreeLogic<OnDiskFormat> btree; - DiskLoc dummyDiskLoc; -}; - - -/** - * Tool to construct custom tree shapes for tests. - */ -template <class OnDiskFormat> -class ArtificialTreeBuilder { -public: - typedef typename BtreeLogic<OnDiskFormat>::BucketType BucketType; - typedef typename BtreeLogic<OnDiskFormat>::KeyDataOwnedType KeyDataOwnedType; - typedef typename BtreeLogic<OnDiskFormat>::KeyHeaderType KeyHeaderType; - - typedef typename OnDiskFormat::FixedWidthKeyType FixedWidthKeyType; - - /** - * The tree builder wraps around the passed-in helper and will invoke methods on it. It - * does not do any cleanup, so constructing multiple trees over the same helper will - * cause leaked records. - */ - ArtificialTreeBuilder(OperationContext* opCtx, BtreeLogicTestHelper<OnDiskFormat>* helper) - : _opCtx(opCtx), _helper(helper) {} - - /** - * Causes the specified tree shape to be built on the associated helper and the tree's - * root installed as the head. Uses a custom JSON-based language with the following - * syntax: - * - * Btree := BTreeBucket - * BtreeBucket := { Child_1_Key: <BtreeBucket | null>, - * Child_2_Key: <BtreeBucket | null>, - * ..., - * _: <BtreeBucket | null> } - * - * The _ key name specifies the content of the nextChild pointer. The value null means - * use a fixed disk loc. 
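     *
     * For example (an illustrative spec, not one taken from the original
     * header), makeTree("{b:{a:null},_:{c:null}}") builds a root bucket with
     * the single key 'b', a child bucket under 'b' holding 'a', and a
     * nextChild bucket holding 'c'.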
- */ - void makeTree(const std::string& spec); - - /** - * Validates that the structure of the Btree in the helper matches the specification. - */ - void checkStructure(const std::string& spec) const; - - /** - * Adds the following key to the bucket and fixes up the child pointers. - */ - void push(const DiskLoc bucketLoc, const BSONObj& key, const DiskLoc child); - - /** - * @return The number of keys inserted. - */ - int fillBucketToExactSize(const DiskLoc bucketLoc, int targetSize, char startKey); - -private: - DiskLoc makeTree(const BSONObj& spec); - - void checkStructure(const BSONObj& spec, const DiskLoc node) const; - - bool isPresent(const BSONObj& key, int direction) const; - - static std::string expectedKey(const char* spec); - - OperationContext* _opCtx; - BtreeLogicTestHelper<OnDiskFormat>* _helper; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/key.cpp b/src/mongo/db/storage/mmap_v1/btree/key.cpp deleted file mode 100644 index 0c5eacb1998..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/key.cpp +++ /dev/null @@ -1,734 +0,0 @@ -/** - * Copyright (C) 2011 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/btree/key.h" - -#include <cmath> - -#include "mongo/base/data_type_endian.h" -#include "mongo/base/data_view.h" -#include "mongo/bson/simple_bsonobj_comparator.h" -#include "mongo/bson/util/builder.h" -#include "mongo/util/log.h" -#include "mongo/util/startup_test.h" - - -namespace mongo { - -using std::endl; -using std::numeric_limits; -using std::min; - -extern const Ordering nullOrdering = Ordering::make(BSONObj()); - -// KeyBson is for V0 (version #0) indexes - -int oldCompare(const BSONObj& l, const BSONObj& r, const Ordering& o); - -// "old" = pre signed dates & such; i.e. 
btree V0 -/* must be same canon type when called */ -int oldCompareElementValues(const BSONElement& l, const BSONElement& r) { - dassert(l.canonicalType() == r.canonicalType()); - int f; - double x; - - switch (l.type()) { - case EOO: - case Undefined: // EOO and Undefined are same canonicalType - case jstNULL: - case MaxKey: - case MinKey: - return 0; - case Bool: - return *l.value() - *r.value(); - case bsonTimestamp: - case Date: { - const unsigned long long lULL = l.date().toULL(); - const unsigned long long rULL = r.date().toULL(); - // unsigned dates for old version - if (lULL < rULL) - return -1; - return lULL == rULL ? 0 : 1; - } - case NumberLong: - if (r.type() == NumberLong) { - long long L = l._numberLong(); - long long R = r._numberLong(); - if (L < R) - return -1; - if (L == R) - return 0; - return 1; - } - // else fall through - case NumberInt: - case NumberDouble: { - double left = l.number(); - double right = r.number(); - bool lNan = - !(left <= numeric_limits<double>::max() && left >= -numeric_limits<double>::max()); - bool rNan = !(right <= numeric_limits<double>::max() && - right >= -numeric_limits<double>::max()); - if (lNan) { - if (rNan) { - return 0; - } else { - return -1; - } - } else if (rNan) { - return 1; - } - x = left - right; - if (x < 0) - return -1; - return x == 0 ? 0 : 1; - } - case jstOID: - return memcmp(l.value(), r.value(), OID::kOIDSize); - case Code: - case Symbol: - case String: - // nulls not allowed in the middle of strings in the old version - return strcmp(l.valuestr(), r.valuestr()); - case Object: - case Array: - return oldCompare(l.embeddedObject(), r.embeddedObject(), nullOrdering); - case DBRef: { - int lsz = l.valuesize(); - int rsz = r.valuesize(); - if (lsz - rsz != 0) - return lsz - rsz; - return memcmp(l.value(), r.value(), lsz); - } - case BinData: { - int lsz = l.objsize(); // our bin data size in bytes, not including the subtype byte - int rsz = r.objsize(); - if (lsz - rsz != 0) - return lsz - rsz; - return memcmp(l.value() + 4, r.value() + 4, lsz + 1); - } - case RegEx: { - int c = strcmp(l.regex(), r.regex()); - if (c) - return c; - return strcmp(l.regexFlags(), r.regexFlags()); - } - case CodeWScope: { - f = l.canonicalType() - r.canonicalType(); - if (f) - return f; - f = strcmp(l.codeWScopeCode(), r.codeWScopeCode()); - if (f) - return f; - f = strcmp(l.codeWScopeScopeDataUnsafe(), r.codeWScopeScopeDataUnsafe()); - if (f) - return f; - return 0; - } - default: - log() << "oldCompareElementValues: bad type " << (int)l.type() << endl; - verify(false); - } - return -1; -} - -int oldElemCompare(const BSONElement& l, const BSONElement& r) { - int lt = (int)l.canonicalType(); - int rt = (int)r.canonicalType(); - int x = lt - rt; - if (x) - return x; - return oldCompareElementValues(l, r); -} - -// pre signed dates & such -int oldCompare(const BSONObj& l, const BSONObj& r, const Ordering& o) { - BSONObjIterator i(l); - BSONObjIterator j(r); - unsigned mask = 1; - while (1) { - // so far, equal... - - BSONElement l = i.next(); - BSONElement r = j.next(); - if (l.eoo()) - return r.eoo() ? 
0 : -1; - if (r.eoo()) - return 1; - - int x; - { - x = oldElemCompare(l, r); - if (o.descending(mask)) - x = -x; - } - if (x != 0) - return x; - mask <<= 1; - } - return -1; -} - -/* old style compares: - - dates are unsigned - - strings no nulls -*/ -int KeyBson::woCompare(const KeyBson& r, const Ordering& o) const { - return oldCompare(_o, r._o, o); -} - -// woEqual could be made faster than woCompare but this is for backward compatibility so not worth a -// big effort -bool KeyBson::woEqual(const KeyBson& r) const { - return oldCompare(_o, r._o, nullOrdering) == 0; -} - -// [ ][HASMORE][x][y][canontype_4bits] -enum CanonicalsEtc { - cminkey = 1, - cnull = 2, - cdouble = 4, - cstring = 6, - cbindata = 7, - coid = 8, - cfalse = 10, - ctrue = 11, - cdate = 12, - cmaxkey = 14, - cCANONTYPEMASK = 0xf, - cY = 0x10, - cint = cY | cdouble, - cX = 0x20, - clong = cX | cdouble, - cHASMORE = 0x40, - cNOTUSED = 0x80 // but see IsBSON sentinel - this bit not usable without great care -}; - -// bindata bson type -const unsigned BinDataLenMask = 0xf0; // lengths are powers of 2 of this value -const unsigned BinDataTypeMask = - 0x0f; // 0-7 as you would expect, 8-15 are 128+value. see BinDataType. -const int BinDataLenMax = 32; -const int BinDataLengthToCode[] = { - 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, - 0x70, 0x80, -1 /*9*/, 0x90 /*10*/, -1 /*11*/, 0xa0 /*12*/, -1 /*13*/, - 0xb0 /*14*/, -1 /*15*/, 0xc0 /*16*/, -1, -1, -1, 0xd0 /*20*/, - -1, -1, -1, 0xe0 /*24*/, -1, -1, -1, - -1, -1, -1, -1, 0xf0 /*32*/ -}; -const int BinDataCodeToLength[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 32}; - -int binDataCodeToLength(int codeByte) { - return BinDataCodeToLength[codeByte >> 4]; -} - -/** object cannot be represented in compact format. so store in traditional bson format - with a leading sentinel byte IsBSON to indicate it's in that format. - - Given that the KeyV1Owned constructor already grabbed a bufbuilder, we reuse it here - so that we don't have to do an extra malloc. -*/ -void KeyV1Owned::traditional(const BSONObj& obj) { - b.reset(); - b.appendUChar(IsBSON); - b.appendBuf(obj.objdata(), obj.objsize()); - _keyData = (const unsigned char*)b.buf(); -} - -KeyV1Owned::KeyV1Owned(const KeyV1& rhs) { - b.appendBuf(rhs.data(), rhs.dataSize()); - _keyData = (const unsigned char*)b.buf(); - dassert(b.len() == dataSize()); // check datasize method is correct - dassert((*_keyData & cNOTUSED) == 0); -} - -// fromBSON to Key format -KeyV1Owned::KeyV1Owned(const BSONObj& obj) { - BSONObjIterator i(obj); - unsigned char bits = 0; - while (1) { - BSONElement e = i.next(); - if (i.more()) - bits |= cHASMORE; - switch (e.type()) { - case MinKey: - b.appendUChar(cminkey | bits); - break; - case jstNULL: - b.appendUChar(cnull | bits); - break; - case MaxKey: - b.appendUChar(cmaxkey | bits); - break; - case Bool: - b.appendUChar((e.boolean() ? 
ctrue : cfalse) | bits); - break; - case jstOID: - b.appendUChar(coid | bits); - b.appendBuf(e.__oid().view().view(), OID::kOIDSize); - break; - case BinData: { - int t = e.binDataType(); - // 0-7 and 0x80 to 0x87 are supported by Key - if ((t & 0x78) == 0 && t != ByteArrayDeprecated) { - int len; - const char* d = e.binData(len); - if (len <= BinDataLenMax) { - int code = BinDataLengthToCode[len]; - if (code >= 0) { - if (t >= 128) - t = (t - 128) | 0x08; - dassert((code & t) == 0); - b.appendUChar(cbindata | bits); - b.appendUChar(code | t); - b.appendBuf(d, len); - break; - } - } - } - traditional(obj); - return; - } - case Date: - b.appendUChar(cdate | bits); - b.appendNum(e.date().toMillisSinceEpoch()); - break; - case String: { - b.appendUChar(cstring | bits); - // note we do not store the terminating null, to save space. - unsigned x = (unsigned)e.valuestrsize() - 1; - if (x > 255) { - traditional(obj); - return; - } - b.appendUChar(x); - b.appendBuf(e.valuestr(), x); - break; - } - case NumberInt: - b.appendUChar(cint | bits); - b.appendNum((double)e._numberInt()); - break; - case NumberLong: { - long long n = e._numberLong(); - long long m = 2LL << 52; - DEV { - long long d = m - 1; - verify(((long long)((double)-d)) == -d); - } - if (n >= m || n <= -m) { - // can't represent exactly as a double - traditional(obj); - return; - } - b.appendUChar(clong | bits); - b.appendNum((double)n); - break; - } - case NumberDouble: { - double d = e._numberDouble(); - if (std::isnan(d)) { - traditional(obj); - return; - } - b.appendUChar(cdouble | bits); - b.appendNum(d); - break; - } - default: - // if other types involved, store as traditional BSON - traditional(obj); - return; - } - if (!i.more()) - break; - bits = 0; - } - _keyData = (const unsigned char*)b.buf(); - dassert(b.len() == dataSize()); // check datasize method is correct - dassert((*_keyData & cNOTUSED) == 0); -} - -BSONObj KeyV1::toBson() const { - verify(_keyData != 0); - if (!isCompactFormat()) - return bson(); - - BSONObjBuilder b(512); - const unsigned char* p = _keyData; - while (1) { - unsigned bits = *p++; - - switch (bits & 0x3f) { - case cminkey: - b.appendMinKey(""); - break; - case cnull: - b.appendNull(""); - break; - case cfalse: - b.appendBool("", false); - break; - case ctrue: - b.appendBool("", true); - break; - case cmaxkey: - b.appendMaxKey(""); - break; - case cstring: { - unsigned sz = *p++; - // we build the element ourself as we have to null terminate it - BufBuilder& bb = b.bb(); - bb.appendNum((char)String); - bb.appendUChar(0); // fieldname "" - bb.appendNum(sz + 1); - bb.appendBuf(p, sz); - bb.appendUChar(0); // null char at end of string - p += sz; - break; - } - case coid: { - OID oid = OID::from(p); - b.appendOID("", &oid); - p += OID::kOIDSize; - break; - } - case cbindata: { - int len = binDataCodeToLength(*p); - int subtype = (*p) & BinDataTypeMask; - if (subtype & 0x8) { - subtype = (subtype & 0x7) | 0x80; - } - b.appendBinData("", len, (BinDataType)subtype, ++p); - p += len; - break; - } - case cdate: - b.appendDate( - "", - Date_t::fromMillisSinceEpoch(ConstDataView(reinterpret_cast<const char*>(p)) - .read<LittleEndian<long long>>())); - p += 8; - break; - case cdouble: - b.append( - "", - ConstDataView(reinterpret_cast<const char*>(p)).read<LittleEndian<double>>()); - p += sizeof(double); - break; - case cint: - b.append("", - static_cast<int>(ConstDataView(reinterpret_cast<const char*>(p)) - .read<LittleEndian<double>>())); - p += sizeof(double); - break; - case clong: - b.append("", - 
static_cast<long long>(ConstDataView(reinterpret_cast<const char*>(p)) - .read<LittleEndian<double>>())); - p += sizeof(double); - break; - default: - verify(false); - } - - if ((bits & cHASMORE) == 0) - break; - } - return b.obj(); -} - -static int compare(const unsigned char*& l, const unsigned char*& r) { - int lt = (*l & cCANONTYPEMASK); - int rt = (*r & cCANONTYPEMASK); - int x = lt - rt; - if (x) - return x; - - l++; - r++; - - // same type - switch (lt) { - case cdouble: { - double L = ConstDataView(reinterpret_cast<const char*>(l)).read<LittleEndian<double>>(); - double R = ConstDataView(reinterpret_cast<const char*>(r)).read<LittleEndian<double>>(); - if (L < R) - return -1; - if (L != R) - return 1; - l += 8; - r += 8; - break; - } - case cstring: { - int lsz = *l; - int rsz = *r; - int common = min(lsz, rsz); - l++; - r++; // skip the size byte - // use memcmp as we (will) allow zeros in UTF8 strings - int res = memcmp(l, r, common); - if (res) - return res; - // longer string is the greater one - int diff = lsz - rsz; - if (diff) - return diff; - l += lsz; - r += lsz; - break; - } - case cbindata: { - int L = *l; - int R = *r; - int llen = binDataCodeToLength(L); - int diff = L - R; // checks length and subtype simultaneously - if (diff) { - // unfortunately nibbles are backwards to do subtype and len in one check (could bit - // swap...) - int rlen = binDataCodeToLength(R); - if (llen != rlen) - return llen - rlen; - return diff; - } - // same length, same type - l++; - r++; - int res = memcmp(l, r, llen); - if (res) - return res; - l += llen; - r += llen; - break; - } - case cdate: { - long long L = - ConstDataView(reinterpret_cast<const char*>(l)).read<LittleEndian<long long>>(); - long long R = - ConstDataView(reinterpret_cast<const char*>(r)).read<LittleEndian<long long>>(); - if (L < R) - return -1; - if (L > R) - return 1; - l += 8; - r += 8; - break; - } - case coid: { - int res = memcmp(l, r, OID::kOIDSize); - if (res) - return res; - l += OID::kOIDSize; - r += OID::kOIDSize; - break; - } - default: - // all the others are a match -- e.g. 
null == null - ; - } - - return 0; -} - -// at least one of this and right are traditional BSON format -int NOINLINE_DECL KeyV1::compareHybrid(const KeyV1& right, const Ordering& order) const { - BSONObj L = toBson(); - BSONObj R = right.toBson(); - return L.woCompare(R, order, /*considerfieldname*/ false); -} - -int KeyV1::woCompare(const KeyV1& right, const Ordering& order) const { - const unsigned char* l = _keyData; - const unsigned char* r = right._keyData; - - if ((*l | *r) == IsBSON) // only can do this if cNOTUSED maintained - return compareHybrid(right, order); - - unsigned mask = 1; - while (1) { - char lval = *l; - char rval = *r; - { - int x = compare(l, r); // updates l and r pointers - if (x) { - if (order.descending(mask)) - x = -x; - return x; - } - } - - { - int x = ((int)(lval & cHASMORE)) - ((int)(rval & cHASMORE)); - if (x) - return x; - if ((lval & cHASMORE) == 0) - break; - } - - mask <<= 1; - } - - return 0; -} - -static unsigned sizes[] = {0, - 1, // cminkey=1, - 1, // cnull=2, - 0, - 9, // cdouble=4, - 0, - 0, // cstring=6, - 0, - 13, // coid=8, - 0, - 1, // cfalse=10, - 1, // ctrue=11, - 9, // cdate=12, - 0, - 1, // cmaxkey=14, - 0}; - -inline unsigned sizeOfElement(const unsigned char* p) { - unsigned type = *p & cCANONTYPEMASK; - unsigned sz = sizes[type]; - if (sz == 0) { - if (type == cstring) { - sz = ((unsigned)p[1]) + 2; - } else { - verify(type == cbindata); - sz = binDataCodeToLength(p[1]) + 2; - } - } - return sz; -} - -int KeyV1::dataSize() const { - const unsigned char* p = _keyData; - if (!isCompactFormat()) { - return bson().objsize() + 1; - } - - bool more; - do { - unsigned z = sizeOfElement(p); - more = (*p & cHASMORE) != 0; - p += z; - } while (more); - return p - _keyData; -} - -bool KeyV1::woEqual(const KeyV1& right) const { - const unsigned char* l = _keyData; - const unsigned char* r = right._keyData; - - if ((*l | *r) == IsBSON) { - return SimpleBSONObjComparator::kInstance.evaluate(toBson() == right.toBson()); - } - - while (1) { - char lval = *l; - char rval = *r; - if ((lval & (cCANONTYPEMASK | cHASMORE)) != (rval & (cCANONTYPEMASK | cHASMORE))) - return false; - l++; - r++; - switch (lval & cCANONTYPEMASK) { - case coid: - if (ConstDataView(reinterpret_cast<const char*>(l)) - .read<LittleEndian<uint32_t>>() != - ConstDataView(reinterpret_cast<const char*>(r)).read<LittleEndian<uint32_t>>()) - return false; - l += 4; - r += 4; - case cdate: - if (ConstDataView(reinterpret_cast<const char*>(l)) - .read<LittleEndian<unsigned long long>>() != - ConstDataView(reinterpret_cast<const char*>(r)) - .read<LittleEndian<unsigned long long>>()) - return false; - l += 8; - r += 8; - break; - case cdouble: - if (ConstDataView(reinterpret_cast<const char*>(l)).read<LittleEndian<double>>() != - ConstDataView(reinterpret_cast<const char*>(r)).read<LittleEndian<double>>()) - return false; - l += 8; - r += 8; - break; - case cstring: { - if (*l != *r) - return false; // not same length - unsigned sz = ((unsigned)*l) + 1; - if (memcmp(l, r, sz)) - return false; - l += sz; - r += sz; - break; - } - case cbindata: { - if (*l != *r) - return false; // len or subtype mismatch - int len = binDataCodeToLength(*l) + 1; - if (memcmp(l, r, len)) - return false; - l += len; - r += len; - break; - } - case cminkey: - case cnull: - case cfalse: - case ctrue: - case cmaxkey: - break; - default: - verify(false); - } - if ((lval & cHASMORE) == 0) - break; - } - return true; -} - -struct CmpUnitTest : public StartupTest { - void run() { - char a[2]; - char b[2]; - a[0] = 
-3; - a[1] = 0; - b[0] = 3; - b[1] = 0; - verify(strcmp(a, b) > 0 && memcmp(a, b, 2) > 0); - } -} cunittest; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/btree/key.h b/src/mongo/db/storage/mmap_v1/btree/key.h deleted file mode 100644 index 906ddcc621b..00000000000 --- a/src/mongo/db/storage/mmap_v1/btree/key.h +++ /dev/null @@ -1,167 +0,0 @@ -// @file key.h class(es) representing individual keys in a btree - -/** -* Copyright (C) 2011 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/db/jsobj.h" -#include "mongo/util/debug_util.h" - -namespace mongo { - -/** Key class for precomputing a small format index key that is denser than a traditional BSONObj. - - KeyBson is a legacy wrapper implementation for old BSONObj style keys for v:0 indexes. - - KeyV1 is the new implementation. 
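A minimal standalone sketch of the per-element byte layout that drives the compact KeyV1 format, using only the CanonicalsEtc constants from the deleted key.cpp above (the main() wrapper and variable names are illustrative, not part of this diff):

#include <cstdio>

// Per-element leading byte: [unused][HASMORE][cX][cY][canontype_4bits].
enum : unsigned {
    cdouble = 4,           // canonical type shared by all numerics
    cCANONTYPEMASK = 0xf,  // low nibble selects the canonical type
    cY = 0x10,             // cint  = cY | cdouble
    cX = 0x20,             // clong = cX | cdouble
    cHASMORE = 0x40,       // set when another key element follows this one
};

int main() {
    unsigned first = cY | cdouble | cHASMORE;  // a NumberInt element with a successor
    std::printf("canonical type %u, hasMore %d\n",
                first & cCANONTYPEMASK,
                (first & cHASMORE) != 0);
    return 0;
}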
-*/ -class KeyBson /* "KeyV0" */ { -public: - KeyBson() {} - explicit KeyBson(const char* keyData) : _o(keyData) {} - explicit KeyBson(const BSONObj& obj) : _o(obj) {} - int woCompare(const KeyBson& r, const Ordering& o) const; - BSONObj toBson() const { - return _o; - } - std::string toString() const { - return _o.toString(); - } - int dataSize() const { - return _o.objsize(); - } - const char* data() const { - return _o.objdata(); - } - BSONElement _firstElement() const { - return _o.firstElement(); - } - bool isCompactFormat() const { - return false; - } - bool woEqual(const KeyBson& r) const; - void assign(const KeyBson& rhs) { - *this = rhs; - } - bool isValid() const { - return true; - } - -private: - BSONObj _o; -}; - -class KeyV1Owned; - -// corresponding to BtreeData_V1 -class KeyV1 { - // disallowed just to make people be careful as we don't own the buffer - void operator=(const KeyV1&); - // disallowed as this is not a great idea as KeyV1Owned likely will go out of scope - KeyV1(const KeyV1Owned&); - -public: - KeyV1() { - _keyData = 0; - } - ~KeyV1() { - DEV _keyData = (const unsigned char*)1; - } - - KeyV1(const KeyV1& rhs) : _keyData(rhs._keyData) { - dassert(_keyData > (const unsigned char*)1); - } - - // explicit version of operator= to be safe - void assign(const KeyV1& rhs) { - _keyData = rhs._keyData; - } - - /** @param keyData can be a buffer containing data in either BSON format, OR in KeyV1 format. - when BSON, we are just a wrapper - */ - explicit KeyV1(const char* keyData) : _keyData((unsigned char*)keyData) {} - - int woCompare(const KeyV1& r, const Ordering& o) const; - bool woEqual(const KeyV1& r) const; - BSONObj toBson() const; - std::string toString() const { - return toBson().toString(); - } - - /** get the key data we want to store in the btree bucket */ - const char* data() const { - return (const char*)_keyData; - } - - /** @return size of data() */ - int dataSize() const; - - /** only used by geo, which always has bson keys */ - BSONElement _firstElement() const { - return bson().firstElement(); - } - bool isCompactFormat() const { - return *_keyData != IsBSON; - } - - bool isValid() const { - return _keyData > (const unsigned char*)1; - } - -protected: - enum { IsBSON = 0xff }; - const unsigned char* _keyData; - BSONObj bson() const { - dassert(!isCompactFormat()); - return BSONObj((const char*)_keyData + 1); - } - -private: - int compareHybrid(const KeyV1& right, const Ordering& order) const; -}; - -class KeyV1Owned : public KeyV1 { - void operator=(const KeyV1Owned&); - -public: - /** @obj a BSON object to be translated to KeyV1 format. If the object isn't - representable in KeyV1 format (which happens, intentionally, at times) - it will stay as bson herein. - */ - KeyV1Owned(const BSONObj& obj); - - /** makes a copy (memcpy's the whole thing) */ - KeyV1Owned(const KeyV1& rhs); - -private: - StackBufBuilder b; - void traditional(const BSONObj& obj); // store as traditional bson not as compact format -}; -}; diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp b/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp deleted file mode 100644 index df766917fac..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/hashtab.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kIndex - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/catalog/hashtab.h" - -#include "mongo/util/log.h" - -namespace mongo { - -int NamespaceHashTable::_find(const Namespace& k, bool& found) const { - found = false; - int h = k.hash(); - int i = h % n; - int start = i; - int chain = 0; - int firstNonUsed = -1; - while (1) { - if (!_nodes(i).inUse()) { - if (firstNonUsed < 0) - firstNonUsed = i; - } - - if (_nodes(i).hash == h && _nodes(i).key == k) { - if (chain >= 200) - log() << "warning: hashtable " << _name << " long chain " << std::endl; - found = true; - return i; - } - chain++; - i = (i + 1) % n; - if (i == start) { - // shouldn't get here / defensive for infinite loops - log() << "error: hashtable " << _name << " is full n:" << n << std::endl; - return -1; - } - if (chain >= maxChain) { - if (firstNonUsed >= 0) - return firstNonUsed; - log() << "error: hashtable " << _name << " max chain reached:" << maxChain << std::endl; - return -1; - } - } -} - -/* buf must be all zeroes on initialization. */ -NamespaceHashTable::NamespaceHashTable(void* buf, int buflen, const char* name) - : _name(name), _buf(buf) { - n = buflen / sizeof(Node); - if ((n & 1) == 0) { - n--; - } - - maxChain = (int)(n * 0.05); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h b/src/mongo/db/storage/mmap_v1/catalog/hashtab.h deleted file mode 100644 index f873e6a4d3a..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/hashtab.h +++ /dev/null @@ -1,138 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/static_assert.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" -#include "mongo/stdx/functional.h" - -namespace mongo { - -/** - * Simple, fixed size hash table used for namespace mapping (effectively the contents of the - * MMAP V1 .ns file). Uses a contiguous block of memory, so you can put it in a memory mapped - * file very easily. - */ -class NamespaceHashTable { - MONGO_DISALLOW_COPYING(NamespaceHashTable); - -public: - typedef stdx::function<void(const Namespace& k, NamespaceDetails& v)> IteratorCallback; - - - /* buf must be all zeroes on initialization. */ - NamespaceHashTable(void* buf, int buflen, const char* name); - - NamespaceDetails* get(const Namespace& k) const { - bool found; - int i = _find(k, found); - if (found) { - return &_nodes(i).value; - } - - return 0; - } - - void kill(OperationContext* opCtx, const Namespace& k) { - bool found; - int i = _find(k, found); - if (i >= 0 && found) { - Node* n = &_nodes(i); - n = opCtx->recoveryUnit()->writing(n); - n->key.kill(); - n->setUnused(); - } - } - - /** returns false if too full */ - bool put(OperationContext* opCtx, const Namespace& k, const NamespaceDetails& value) { - bool found; - int i = _find(k, found); - if (i < 0) - return false; - - Node* n = opCtx->recoveryUnit()->writing(&_nodes(i)); - if (!found) { - n->key = k; - n->hash = k.hash(); - } else { - invariant(n->hash == k.hash()); - } - - n->value = value; - return true; - } - - void iterAll(IteratorCallback callback) { - for (int i = 0; i < n; i++) { - if (_nodes(i).inUse()) { - callback(_nodes(i).key, _nodes(i).value); - } - } - } - - -private: -#pragma pack(1) - struct Node { - int hash; - Namespace key; - NamespaceDetails value; - - bool inUse() const { - return hash != 0; - } - - void setUnused() { - hash = 0; - } - }; -#pragma pack() - - MONGO_STATIC_ASSERT(sizeof(Node) == 628); - - - int _find(const Namespace& k, bool& found) const; - - Node& _nodes(int i) const { - Node* nodes = (Node*)_buf; - return nodes[i]; - } - - - const char* _name; - void* const _buf; - - int n; // number of hashtable buckets - int maxChain; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp deleted file mode 100644 index fa9093196f8..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/index_details.cpp +++ /dev/null @@ -1,39 +0,0 @@ -// index_details.cpp - -/** -* Copyright (C) 2008 10gen Inc. 
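The NamespaceHashTable removed above is a classic open-addressing table: hash to a bucket, then probe linearly until the key, a reusable free slot, or an exhausted chain. A condensed standalone sketch of that probe loop (the Slot struct and fixed-size key are simplifications, not the on-disk Node):

#include <cstring>

struct Slot {
    int hash;      // 0 means unused, mirroring Node::inUse() above
    char key[64];  // fixed-width key, like Namespace
    int value;
};

// Returns the index of the matching slot, else the first free slot seen
// (a place to insert), else -1 when the table is completely full.
int findSlot(const Slot* slots, int n, const char* k, int h) {
    int i = h % n;
    int firstFree = -1;
    for (int probes = 0; probes < n; ++probes, i = (i + 1) % n) {
        if (slots[i].hash == 0) {
            if (firstFree < 0)
                firstFree = i;
        } else if (slots[i].hash == h && std::strcmp(slots[i].key, k) == 0) {
            return i;  // found the live entry
        }
    }
    return firstFree;
}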
-* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/db/storage/mmap_v1/catalog/index_details.h" - -namespace mongo { - -void IndexDetails::_reset() { - head.setInvalid(); - info.setInvalid(); -} -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/index_details.h b/src/mongo/db/storage/mmap_v1/catalog/index_details.h deleted file mode 100644 index 1ee5387c57c..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/index_details.h +++ /dev/null @@ -1,70 +0,0 @@ -// index_details.h - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -/* Details about a particular index. 
There is one of these effectively for each object in - system.namespaces (although this also includes the head pointer, which is not in that - collection). - - This is an internal part of the catalog. Nothing outside of the catalog should use this. - - ** MemoryMapped in NamespaceDetails ** (i.e., this is on disk data) - */ -#pragma pack(1) -struct IndexDetails { - /** - * btree head disk location - */ - DiskLoc head; - - /* Location of index info object. Format: - - { name:"nameofindex", ns:"parentnsname", key: {keypattobject} - [, unique: <bool>, background: <bool>, v:<version>] - } - - This object is in the system.indexes collection. Note that since we - have a pointer to the object here, the object in system.indexes MUST NEVER MOVE. - */ - DiskLoc info; - - /** - * makes head and info invalid - */ - void _reset(); -}; -#pragma pack() - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace.h b/src/mongo/db/storage/mmap_v1/catalog/namespace.h deleted file mode 100644 index e7f2ba636a5..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace.h +++ /dev/null @@ -1,168 +0,0 @@ -/** - * Copyright (C) 2017 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <boost/filesystem/path.hpp> -#include <cstring> -#include <string> - -#include "mongo/base/string_data.h" -#include "mongo/db/namespace_string.h" -#include "mongo/util/assert_util.h" -#include "mongo/util/mongoutils/str.h" - -namespace mongo { - -#pragma pack(1) -/** - * This is used for storing a namespace on disk in a fixed width form and should only be used for - * that, not for passing internally. - * - * If you need to pass internally, please use NamespaceString. - */ -class Namespace { -public: - Namespace(StringData ns) { - *this = ns; - } - - Namespace& operator=(StringData ns) { - // We fill the remaining space with all zeroes here. As the full Namespace struct is in the - // datafiles (the .ns files specifically), that is helpful as then they are deterministic in - // the bytes they have for a given sequence of operations. This makes testing and debugging - // the data files easier.
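// A standalone sketch of the zero-fill-then-copy effect described above
// (the FixedName type is hypothetical, not part of this diff):

#include <cstring>

struct FixedName {
    char buf[128];
    void set(const char* name, std::size_t len) {  // caller guarantees len <= 127
        std::memset(buf, 0, sizeof(buf));  // padding bytes become deterministic
        std::memcpy(buf, name, len + 1);   // the name plus its NUL terminator
    }
};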
- // - // If profiling indicates this method is a significant bottleneck, we could have a version - // we use for reads which does not fill with zeroes, and keep the zeroing behavior on - // writes. - memset(buf, 0, sizeof(buf)); - uassert(10080, - str::stream() << "ns name " << ns << " (size: " << ns.size() - << ") too long, max size is 127 bytes", - ns.size() <= MaxNsLen); - uassert( - 17380, "ns name can't contain embedded '\0' byte", ns.find('\0') == std::string::npos); - ns.copyTo(buf, true); - return *this; - } - - void kill() { - buf[0] = 0x7f; - } - - bool operator==(const char* r) const { - return strcmp(buf, r) == 0; - } - bool operator==(const Namespace& r) const { - return strcmp(buf, r.buf) == 0; - } - bool operator!=(const char* r) const { - return strcmp(buf, r) != 0; - } - bool operator!=(const Namespace& r) const { - return strcmp(buf, r.buf) != 0; - } - - bool hasDollarSign() const { - return strchr(buf, '$') != NULL; - } - - /** - * Value returned is always > 0 - */ - int hash() const { - unsigned x = 0; - const char* p = buf; - while (*p) { - x = x * 131 + *p; - p++; - } - return (x & 0x7fffffff) | 0x8000000; // must be > 0 - } - - size_t size() const { - return strlen(buf); - } - - std::string toString() const { - return buf; - } - operator std::string() const { - return buf; - } - - /** - * NamespaceDetails::Extra was added after the fact to allow chaining of data blocks to support - * more than 10 indexes (more than 10 IndexDetails). It's a bit hacky because of this late - * addition with backward file support. - */ - std::string extraName(int i) const { - char ex[] = "$extra"; - ex[5] += i; - const std::string s = std::string(buf) + ex; - massert(10348, "$extra: ns name too long", s.size() <= MaxNsLen); - return s; - } - - /** - * Returns whether the namespace ends with "$extr...". When true it represents an extra block - * not a normal NamespaceDetails block. - */ - bool isExtra() const { - const char* p = strstr(buf, "$extr"); - return p && p[5] && - p[6] == 0; //== 0 is important in case an index uses name "$extra_1" for example - } - - enum MaxNsLenValue { - // Maximum possible length of the name of any namespace, including special ones like - // $extra. This includes room for the NUL byte so it can be used when sizing buffers. - MaxNsLenWithNUL = 128, - - // MaxNsLenWithNUL excluding the NUL byte. Use this when comparing std::string lengths. - MaxNsLen = MaxNsLenWithNUL - 1, - - // Maximum allowed length of fully qualified namespace name of any real collection. Does not - // include NUL so it can be directly compared to std::string lengths. - MaxNsCollectionLen = MaxNsLen - 7 /*strlen(".$extra")*/, - }; - -private: - char buf[MaxNsLenWithNUL]; -}; -#pragma pack() - -namespace { -MONGO_STATIC_ASSERT(sizeof(Namespace) == 128); -MONGO_STATIC_ASSERT(Namespace::MaxNsLenWithNUL == MaxDatabaseNameLen); -MONGO_STATIC_ASSERT((int)Namespace::MaxNsLenWithNUL == (int)NamespaceString::MaxNsLenWithNUL); -MONGO_STATIC_ASSERT((int)Namespace::MaxNsLen == (int)NamespaceString::MaxNsLen); -MONGO_STATIC_ASSERT((int)Namespace::MaxNsCollectionLen == (int)NamespaceString::MaxNsCollectionLen); -} // namespace -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp deleted file mode 100644 index df75d2ba8aa..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.cpp +++ /dev/null @@ -1,242 +0,0 @@ -/** -* Copyright (C) 2008 10gen Inc.
-* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" - -#include <algorithm> -#include <list> - -#include "mongo/base/counter.h" -#include "mongo/db/catalog/collection.h" -#include "mongo/db/catalog/collection_options.h" -#include "mongo/db/clientcursor.h" -#include "mongo/db/commands/server_status.h" -#include "mongo/db/concurrency/locker.h" -#include "mongo/db/db.h" -#include "mongo/db/index_legacy.h" -#include "mongo/db/json.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/ops/delete.h" -#include "mongo/db/ops/update.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" -#include "mongo/scripting/engine.h" -#include "mongo/util/startup_test.h" - -namespace mongo { - -NamespaceDetails::NamespaceDetails(const DiskLoc& loc, bool capped) { - MONGO_STATIC_ASSERT(sizeof(NamespaceDetails::Extra) <= sizeof(NamespaceDetails)); - - /* be sure to initialize new fields here -- doesn't default to zeroes the way we use it */ - firstExtent = lastExtent = capExtent = loc; - stats.datasize = stats.nrecords = 0; - lastExtentSize = 0; - nIndexes = 0; - isCapped = capped; - maxDocsInCapped = 0x7fffffff; // no limit (value is for pre-v2.3.2 compatibility) - paddingFactorOldDoNotUse = 1.0; - systemFlagsOldDoNotUse = 0; - userFlags = 0; - capFirstNewRecord = DiskLoc(); - // Signal that we are on first allocation iteration through extents. - capFirstNewRecord.setInvalid(); - // For capped case, signal that we are doing initial extent allocation. 
- if (capped) { - // WAS: cappedLastDelRecLastExtent().setInvalid(); - deletedListSmall[1].setInvalid(); - } - verify(sizeof(_dataFileVersion) == 2); - _dataFileVersion = 0; - _indexFileVersion = 0; - multiKeyIndexBits = 0; - _reservedA = 0; - _extraOffset = 0; - indexBuildsInProgress = 0; - memset(_reserved, 0, sizeof(_reserved)); -} - -NamespaceDetails::Extra* NamespaceDetails::allocExtra(OperationContext* opCtx, - StringData ns, - NamespaceIndex& ni, - int nindexessofar) { - // Namespace details must always be changed under an exclusive DB lock - const NamespaceString nss(ns); - invariant(opCtx->lockState()->isDbLockedForMode(nss.db(), MODE_X)); - - int i = (nindexessofar - NIndexesBase) / NIndexesExtra; - verify(i >= 0 && i <= 1); - - Namespace fullns(ns); - Namespace extrans(fullns.extraName(i)); // throws AssertionException if ns name too long - - massert(10351, "allocExtra: extra already exists", ni.details(extrans) == 0); - - Extra temp; - temp.init(); - - ni.add_ns(opCtx, extrans, reinterpret_cast<NamespaceDetails*>(&temp)); - Extra* e = reinterpret_cast<NamespaceDetails::Extra*>(ni.details(extrans)); - - long ofs = e->ofsFrom(this); - if (i == 0) { - verify(_extraOffset == 0); - *opCtx->recoveryUnit()->writing(&_extraOffset) = ofs; - verify(extra() == e); - } else { - Extra* hd = extra(); - verify(hd->next(this) == 0); - hd->setNext(opCtx, ofs); - } - return e; -} - -IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) { - if (idxNo < NIndexesBase) { - IndexDetails& id = _indexes[idxNo]; - return id; - } - Extra* e = extra(); - if (!e) { - if (missingExpected) - uasserted(13283, "Missing Extra"); - massert(14045, "missing Extra", e); - } - int i = idxNo - NIndexesBase; - if (i >= NIndexesExtra) { - e = e->next(this); - if (!e) { - if (missingExpected) - uasserted(14823, "missing extra"); - massert(14824, "missing Extra", e); - } - i -= NIndexesExtra; - } - return e->details[i]; -} - - -const IndexDetails& NamespaceDetails::idx(int idxNo, bool missingExpected) const { - if (idxNo < NIndexesBase) { - const IndexDetails& id = _indexes[idxNo]; - return id; - } - const Extra* e = extra(); - if (!e) { - if (missingExpected) - uasserted(17421, "Missing Extra"); - massert(17422, "missing Extra", e); - } - int i = idxNo - NIndexesBase; - if (i >= NIndexesExtra) { - e = e->next(this); - if (!e) { - if (missingExpected) - uasserted(17423, "missing extra"); - massert(17424, "missing Extra", e); - } - i -= NIndexesExtra; - } - return e->details[i]; -} - -NamespaceDetails::IndexIterator::IndexIterator(const NamespaceDetails* _d, - bool includeBackgroundInProgress) { - d = _d; - i = 0; - n = d->nIndexes; - if (includeBackgroundInProgress) - n += d->indexBuildsInProgress; -} - -// must be called when renaming a NS to fix up extra -void NamespaceDetails::copyingFrom(OperationContext* opCtx, - StringData thisns, - NamespaceIndex& ni, - NamespaceDetails* src) { - _extraOffset = 0; // we are a copy -- the old value is wrong. fixing it up below. 
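// The loop below re-links the copy through Extra's base-relative offsets
// (ofsFrom / setNext / next): a memory-mapped structure cannot hold raw
// pointers, since the mapping address changes from run to run, so each link
// is a byte offset from the owning NamespaceDetails. A standalone sketch of
// the idiom (hypothetical Base/Link names, not this diff):

struct Base;  // stands in for NamespaceDetails

struct Link {
    long _next = 0;  // byte offset from the base object; 0 terminates the chain

    long ofsFrom(const Base* base) const {
        return reinterpret_cast<const char*>(this) - reinterpret_cast<const char*>(base);
    }
    const Link* next(const Base* base) const {
        return _next == 0 ? nullptr
                          : reinterpret_cast<const Link*>(
                                reinterpret_cast<const char*>(base) + _next);
    }
};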
- Extra* se = src->extra(); - int n = NIndexesBase; - if (se) { - Extra* e = allocExtra(opCtx, thisns, ni, n); - while (1) { - n += NIndexesExtra; - e->copy(this, *se); - se = se->next(src); - if (se == 0) - break; - Extra* nxt = allocExtra(opCtx, thisns, ni, n); - e->setNext(opCtx, nxt->ofsFrom(this)); - e = nxt; - } - verify(_extraOffset); - } -} - -NamespaceDetails* NamespaceDetails::writingWithoutExtra(OperationContext* opCtx) { - return opCtx->recoveryUnit()->writing(this); -} - - -// XXX - this method should go away -NamespaceDetails* NamespaceDetails::writingWithExtra(OperationContext* opCtx) { - for (Extra* e = extra(); e; e = e->next(this)) { - opCtx->recoveryUnit()->writing(e); - } - return writingWithoutExtra(opCtx); -} - -void NamespaceDetails::setMaxCappedDocs(OperationContext* opCtx, long long max) { - massert(16499, - "max in a capped collection has to be < 2^31 or -1", - CollectionOptions::validMaxCappedDocs(&max)); - maxDocsInCapped = max; -} - -/* ------------------------------------------------------------------------- */ - - -int NamespaceDetails::_catalogFindIndexByName(OperationContext* opCtx, - const Collection* coll, - StringData name, - bool includeBackgroundInProgress) const { - IndexIterator i = ii(includeBackgroundInProgress); - while (i.more()) { - const BSONObj obj = coll->docFor(opCtx, i.next().info.toRecordId()).value(); - if (name == obj.getStringField("name")) - return i.pos() - 1; - } - return -1; -} - -void NamespaceDetails::Extra::setNext(OperationContext* opCtx, long ofs) { - *opCtx->recoveryUnit()->writing(&_next) = ofs; -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h deleted file mode 100644 index cf82703a25d..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details.h +++ /dev/null @@ -1,254 +0,0 @@ -/** - * Copyright (C) 2008 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#pragma once - -#include "mongo/base/static_assert.h" -#include "mongo/db/namespace_string.h" -#include "mongo/db/storage/mmap_v1/catalog/index_details.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -class Collection; -class NamespaceIndex; -class OperationContext; - -#pragma pack(1) -/* NamespaceDetails : this is the "header" for a collection that has all its details. - It's in the .ns file and this is a memory mapped region (thus the pack pragma above). -*/ -class NamespaceDetails { -public: - enum { NIndexesMax = 64, NIndexesExtra = 30, NIndexesBase = 10 }; - - // deleted lists -- linked lists of deleted records -- are placed in 'buckets' of various - // sizes so you can look for a deleted record of about the right size. These buckets are - // split into small and large groups for compatibility with old versions. - static const int SmallBuckets = 18; - static const int LargeBuckets = 8; - - - /*-------- data fields, as present on disk : */ - - DiskLoc firstExtent; - DiskLoc lastExtent; - - /* NOTE: capped collections v1 override the meaning of deletedList. - deletedList[0] points to a list of free records (DeletedRecord's) for all extents in - the capped namespace. - deletedList[1] points to the last record in the prev extent. When the "current extent" - changes, this value is updated. !deletedList[1].isValid() when this value is not - yet computed. - */ - DiskLoc deletedListSmall[SmallBuckets]; - DiskLoc deletedListLegacyGrabBag; // old implementations put records of multiple sizes here. - - // ofs 168 (8 byte aligned) - struct Stats { - // datasize and nrecords MUST be adjacent, code assumes! - long long datasize; // this includes padding, but not record headers - long long nrecords; - } stats; - - - int lastExtentSize; - - int nIndexes; - - // ofs 192 - IndexDetails _indexes[NIndexesBase]; - -public: - // ofs 352 (16 byte aligned) - int isCapped; // there is wasted space here if I'm right (ERH) - - int maxDocsInCapped; // max # of objects for a capped table, -1 for inf. - - double paddingFactorOldDoNotUse; - // ofs 368 (16) - int systemFlagsOldDoNotUse; // things that the system sets/cares about - - DiskLoc capExtent; // the "current" extent we're writing to for a capped collection - DiskLoc capFirstNewRecord; - - // NamespaceDetails version. So we can do backward compatibility in the future. See filever.h - unsigned short _dataFileVersion; - unsigned short _indexFileVersion; - - unsigned long long multiKeyIndexBits; - - // ofs 400 (16) - unsigned long long _reservedA; - long long _extraOffset; // where the $extra info is located (bytes relative to this) - -public: - int indexBuildsInProgress; // Number of indexes currently being built - - int userFlags; - - DiskLoc deletedListLarge[LargeBuckets]; - - // Think carefully before using this. We need at least 8 bytes reserved to leave room for a - // DiskLoc pointing to more data (eg in a dummy MmapV1RecordHeader or Extent). There is still - // _reservedA above, but these are the final two reserved 8-byte regions.
- char _reserved[8]; - /*-------- end data 496 bytes */ -public: - explicit NamespaceDetails(const DiskLoc& loc, bool _capped); - - class Extra { - long long _next; - - public: - IndexDetails details[NIndexesExtra]; - - private: - unsigned reserved2; - unsigned reserved3; - Extra(const Extra&) { - verify(false); - } - Extra& operator=(const Extra& r) { - verify(false); - return *this; - } - - public: - Extra() {} - long ofsFrom(NamespaceDetails* d) { - return ((char*)this) - ((char*)d); - } - void init() { - memset(this, 0, sizeof(Extra)); - } - Extra* next(const NamespaceDetails* d) const { - if (_next == 0) - return 0; - return (Extra*)(((char*)d) + _next); - } - void setNext(OperationContext* opCtx, long ofs); - void copy(NamespaceDetails* d, const Extra& e) { - memcpy(this, &e, sizeof(Extra)); - _next = 0; - } - }; - Extra* extra() const { - if (_extraOffset == 0) - return 0; - return (Extra*)(((char*)this) + _extraOffset); - } - /* add extra space for indexes when more than 10 */ - Extra* allocExtra(OperationContext* opCtx, - StringData ns, - NamespaceIndex& ni, - int nindexessofar); - - void copyingFrom(OperationContext* opCtx, - StringData thisns, - NamespaceIndex& ni, - NamespaceDetails* src); // must be called when renaming a NS to fix up extra - -public: - void setMaxCappedDocs(OperationContext* opCtx, long long max); - - enum UserFlags { - Flag_UsePowerOf2Sizes = 1 << 0, - Flag_NoPadding = 1 << 1, - }; - - IndexDetails& idx(int idxNo, bool missingExpected = false); - const IndexDetails& idx(int idxNo, bool missingExpected = false) const; - - class IndexIterator { - public: - int pos() { - return i; - } // note this is the next one to come - bool more() { - return i < n; - } - const IndexDetails& next() { - return d->idx(i++); - } - - private: - friend class NamespaceDetails; - int i, n; - const NamespaceDetails* d; - IndexIterator(const NamespaceDetails* _d, bool includeBackgroundInProgress); - }; - - IndexIterator ii(bool includeBackgroundInProgress = false) const { - return IndexIterator(this, includeBackgroundInProgress); - } - - /** - * This fetches the IndexDetails for the next empty index slot. The caller must populate - * returned object. This handles allocating extra index space, if necessary. - */ - IndexDetails& getNextIndexDetails(OperationContext* opCtx, Collection* collection); - - NamespaceDetails* writingWithoutExtra(OperationContext* opCtx); - - /** Make all linked Extra objects writeable as well */ - NamespaceDetails* writingWithExtra(OperationContext* opCtx); - - /** - * Returns the offset of the specified index name within the array of indexes. Must be - * passed-in the owning collection to resolve the index record entries to objects. - * - * @return > 0 if index name was found, -1 otherwise. 
- */ - int _catalogFindIndexByName(OperationContext* opCtx, - const Collection* coll, - StringData name, - bool includeBackgroundInProgress) const; - -private: - /** - * swaps all meta data for 2 indexes - * a and b are 2 index ids, whose contents will be swapped - * must have a lock on the entire collection to do this - */ - void swapIndex(OperationContext* opCtx, int a, int b); - - friend class IndexCatalog; - friend class IndexCatalogEntry; - - /** Update cappedLastDelRecLastExtent() after capExtent changed in cappedTruncateAfter() */ - void cappedTruncateLastDelUpdate(); - MONGO_STATIC_ASSERT(NIndexesMax <= NIndexesBase + NIndexesExtra * 2); - MONGO_STATIC_ASSERT(NIndexesMax <= 64); // multiKey bits - MONGO_STATIC_ASSERT(sizeof(NamespaceDetails::Extra) == 496); -}; // NamespaceDetails -MONGO_STATIC_ASSERT(sizeof(NamespaceDetails) == 496); -#pragma pack() - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp deleted file mode 100644 index a61effdf205..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.cpp +++ /dev/null @@ -1,488 +0,0 @@ -// namespace_details_collection_entry.h - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
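The #pragma pack(1) plus MONGO_STATIC_ASSERT(sizeof(NamespaceDetails) == 496) pairing that closes the deleted header above is the pattern that keeps a memory-mapped layout frozen across compilers and releases. A standalone sketch of the same pattern in plain C++ (OnDiskRecord and its 20-byte size are hypothetical, not MongoDB's):

#include <cstdint>

#pragma pack(1)
struct OnDiskRecord {
    int64_t firstExtent;     // 8 bytes
    int64_t lastExtent;      // 8 bytes
    int32_t lastExtentSize;  // 4 bytes; pack(1) forbids trailing padding
};
#pragma pack()

// Compilation fails the moment anyone perturbs the on-disk layout.
static_assert(sizeof(OnDiskRecord) == 20, "on-disk layout must not change");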
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h" - -#include "mongo/db/catalog/database.h" -#include "mongo/db/catalog/database_holder.h" -#include "mongo/db/catalog/uuid_catalog.h" -#include "mongo/db/index/index_descriptor.h" -#include "mongo/db/ops/update.h" -#include "mongo/db/record_id.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" -#include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h" -#include "mongo/db/storage/record_store.h" -#include "mongo/util/log.h" -#include "mongo/util/startup_test.h" - -namespace mongo { - -using std::string; - -NamespaceDetailsCollectionCatalogEntry::NamespaceDetailsCollectionCatalogEntry( - StringData ns, - NamespaceDetails* details, - RecordStore* namespacesRecordStore, - RecordId namespacesRecordId, - RecordStore* indexRecordStore, - MMAPV1DatabaseCatalogEntry* db) - : CollectionCatalogEntry(ns), - _details(details), - _namespacesRecordStore(namespacesRecordStore), - _indexRecordStore(indexRecordStore), - _db(db) { - setNamespacesRecordId(nullptr, namespacesRecordId); -} - -CollectionOptions NamespaceDetailsCollectionCatalogEntry::getCollectionOptions( - OperationContext* opCtx) const { - CollectionOptions options = _db->getCollectionOptions(opCtx, _namespacesRecordId); - - if (options.flagsSet) { - if (options.flags != _details->userFlags) { - warning() << "system.namespaces and NamespaceDetails disagree about userFlags." - << " system.namespaces: " << options.flags - << " NamespaceDetails: " << _details->userFlags; - dassert(options.flags == _details->userFlags); - } - } - - // Fill in the actual flags from the NamespaceDetails. - // Leaving flagsSet alone since it indicates whether the user actively set the flags. 
- options.flags = _details->userFlags; - - return options; -} - -int NamespaceDetailsCollectionCatalogEntry::getTotalIndexCount(OperationContext* opCtx) const { - return _details->nIndexes + _details->indexBuildsInProgress; -} - -int NamespaceDetailsCollectionCatalogEntry::getCompletedIndexCount(OperationContext* opCtx) const { - return _details->nIndexes; -} - -int NamespaceDetailsCollectionCatalogEntry::getMaxAllowedIndexes() const { - return NamespaceDetails::NIndexesMax; -} - -void NamespaceDetailsCollectionCatalogEntry::getAllIndexes(OperationContext* opCtx, - std::vector<std::string>* names) const { - NamespaceDetails::IndexIterator i = _details->ii(true); - while (i.more()) { - const IndexDetails& id = i.next(); - const BSONObj obj = _indexRecordStore->dataFor(opCtx, id.info.toRecordId()).toBson(); - names->push_back(obj.getStringField("name")); - } -} - -void NamespaceDetailsCollectionCatalogEntry::getReadyIndexes( - OperationContext* opCtx, std::vector<std::string>* names) const { - NamespaceDetails::IndexIterator i = _details->ii(true); - while (i.more()) { - const IndexDetails& id = i.next(); - const BSONObj obj = _indexRecordStore->dataFor(opCtx, id.info.toRecordId()).toBson(); - const char* idxName = obj.getStringField("name"); - if (isIndexReady(opCtx, StringData(idxName))) { - names->push_back(idxName); - } - } -} - -bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(OperationContext* opCtx, - StringData idxName, - MultikeyPaths* multikeyPaths) const { - // TODO SERVER-22727: Populate 'multikeyPaths' with path components that cause 'idxName' to be - // multikey. - int idxNo = _findIndexNumber(opCtx, idxName); - invariant(idxNo >= 0); - return isIndexMultikey(idxNo); -} - -bool NamespaceDetailsCollectionCatalogEntry::isIndexMultikey(int idxNo) const { - return (_details->multiKeyIndexBits & (((unsigned long long)1) << idxNo)) != 0; -} - -bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey( - OperationContext* opCtx, StringData indexName, const MultikeyPaths& multikeyPaths) { - // TODO SERVER-22727: Store new path components from 'multikeyPaths' that cause 'indexName' to - // be multikey. 
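// The overload below treats multiKeyIndexBits as one flag per index slot,
// which is why NIndexesMax is statically asserted to be <= 64 elsewhere in
// this diff. A standalone sketch of the same test/set/clear bit logic
// (hypothetical helper names, not this diff):

#include <cstdint>

inline bool isMultikeyBit(uint64_t bits, int idxNo) {
    return (bits & (1ULL << idxNo)) != 0;
}

inline uint64_t withMultikeyBit(uint64_t bits, int idxNo, bool multikey) {
    const uint64_t mask = 1ULL << idxNo;
    return multikey ? (bits | mask) : (bits & ~mask);
}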
- int idxNo = _findIndexNumber(opCtx, indexName); - invariant(idxNo >= 0); - const bool multikey = true; - return setIndexIsMultikey(opCtx, idxNo, multikey); -} - -bool NamespaceDetailsCollectionCatalogEntry::setIndexIsMultikey(OperationContext* opCtx, - int idxNo, - bool multikey) { - unsigned long long mask = 1ULL << idxNo; - - if (multikey) { - // Shortcut if the bit is already set correctly - if (_details->multiKeyIndexBits & mask) { - return false; - } - - *opCtx->recoveryUnit()->writing(&_details->multiKeyIndexBits) |= mask; - } else { - // Shortcut if the bit is already set correctly - if (!(_details->multiKeyIndexBits & mask)) { - return false; - } - - // Invert mask: all 1's except a 0 at the ith bit - mask = ~mask; - *opCtx->recoveryUnit()->writing(&_details->multiKeyIndexBits) &= mask; - } - - return true; -} - -RecordId NamespaceDetailsCollectionCatalogEntry::getIndexHead(OperationContext* opCtx, - StringData idxName) const { - int idxNo = _findIndexNumber(opCtx, idxName); - invariant(idxNo >= 0); - return _details->idx(idxNo).head.toRecordId(); -} - -BSONObj NamespaceDetailsCollectionCatalogEntry::getIndexSpec(OperationContext* opCtx, - StringData idxName) const { - int idxNo = _findIndexNumber(opCtx, idxName); - invariant(idxNo >= 0); - const IndexDetails& id = _details->idx(idxNo); - return _indexRecordStore->dataFor(opCtx, id.info.toRecordId()).toBson(); -} - -void NamespaceDetailsCollectionCatalogEntry::setIndexHead(OperationContext* opCtx, - StringData idxName, - const RecordId& newHead) { - int idxNo = _findIndexNumber(opCtx, idxName); - invariant(idxNo >= 0); - *opCtx->recoveryUnit()->writing(&_details->idx(idxNo).head) = DiskLoc::fromRecordId(newHead); -} - -bool NamespaceDetailsCollectionCatalogEntry::isIndexReady(OperationContext* opCtx, - StringData idxName) const { - int idxNo = _findIndexNumber(opCtx, idxName); - invariant(idxNo >= 0); - return idxNo < getCompletedIndexCount(opCtx); -} - -KVPrefix NamespaceDetailsCollectionCatalogEntry::getIndexPrefix(OperationContext* opCtx, - StringData indexName) const { - return KVPrefix::kNotPrefixed; -} - -int NamespaceDetailsCollectionCatalogEntry::_findIndexNumber(OperationContext* opCtx, - StringData idxName) const { - NamespaceDetails::IndexIterator i = _details->ii(true); - while (i.more()) { - const IndexDetails& id = i.next(); - int idxNo = i.pos() - 1; - const BSONObj obj = _indexRecordStore->dataFor(opCtx, id.info.toRecordId()).toBson(); - if (idxName == obj.getStringField("name")) - return idxNo; - } - return -1; -} - -/* remove bit from a bit array - actually remove its slot, not a clear - note: this function does not work with x == 63 -- that is ok - but keep in mind in the future if max indexes were extended to - exactly 64 it would be a problem -*/ -unsigned long long removeAndSlideBit(unsigned long long b, int x) { - unsigned long long tmp = b; - return (tmp & ((((unsigned long long)1) << x) - 1)) | ((tmp >> (x + 1)) << x); -} - -class IndexUpdateTest : public StartupTest { -public: - void run() { - verify(removeAndSlideBit(1, 0) == 0); - verify(removeAndSlideBit(2, 0) == 1); - verify(removeAndSlideBit(2, 1) == 0); - verify(removeAndSlideBit(255, 1) == 127); - verify(removeAndSlideBit(21, 2) == 9); - verify(removeAndSlideBit(0x4000000000000001ULL, 62) == 1); - } -} iu_unittest; - -Status NamespaceDetailsCollectionCatalogEntry::removeIndex(OperationContext* opCtx, - StringData indexName) { - int idxNo = _findIndexNumber(opCtx, indexName); - if (idxNo < 0) - return Status(ErrorCodes::NamespaceNotFound, "index 
not found to remove"); - - RecordId infoLocation = _details->idx(idxNo).info.toRecordId(); - - { // sanity check - BSONObj info = _indexRecordStore->dataFor(opCtx, infoLocation).toBson(); - invariant(info["name"].String() == indexName); - } - - { // drop the namespace - string indexNamespace = IndexDescriptor::makeIndexNamespace(ns().ns(), indexName); - Status status = _db->dropCollection(opCtx, indexNamespace); - if (!status.isOK()) { - return status; - } - } - - { // all info in the .ns file - NamespaceDetails* d = _details->writingWithExtra(opCtx); - - // fix the _multiKeyIndexBits, by moving all bits above me down one - d->multiKeyIndexBits = removeAndSlideBit(d->multiKeyIndexBits, idxNo); - - if (idxNo >= d->nIndexes) - d->indexBuildsInProgress--; - else - d->nIndexes--; - - for (int i = idxNo; i < getTotalIndexCount(opCtx); i++) - d->idx(i) = d->idx(i + 1); - - d->idx(getTotalIndexCount(opCtx)) = IndexDetails(); - } - - // Someone may be querying the system.indexes namespace directly, so we need to invalidate - // its cursors. - MMAPV1DatabaseCatalogEntry::invalidateSystemCollectionRecord( - opCtx, NamespaceString(_db->name(), "system.indexes"), infoLocation); - - // remove from system.indexes - _indexRecordStore->deleteRecord(opCtx, infoLocation); - - return Status::OK(); -} - -Status NamespaceDetailsCollectionCatalogEntry::prepareForIndexBuild( - OperationContext* opCtx, const IndexDescriptor* desc, bool isBackgroundSecondaryBuild) { - BSONObj spec = desc->infoObj(); - // 1) entry in system.indexes - // TODO SERVER-30638: using timestamp 0 for these inserts. - StatusWith<RecordId> systemIndexesEntry = - _indexRecordStore->insertRecord(opCtx, spec.objdata(), spec.objsize(), Timestamp(), false); - if (!systemIndexesEntry.isOK()) - return systemIndexesEntry.getStatus(); - - // 2) NamespaceDetails mods - IndexDetails* id; - try { - id = &_details->idx(getTotalIndexCount(opCtx), true); - } catch (DBException&) { - _details->allocExtra(opCtx, ns().ns(), _db->_namespaceIndex, getTotalIndexCount(opCtx)); - id = &_details->idx(getTotalIndexCount(opCtx), false); - } - - const DiskLoc infoLoc = DiskLoc::fromRecordId(systemIndexesEntry.getValue()); - *opCtx->recoveryUnit()->writing(&id->info) = infoLoc; - *opCtx->recoveryUnit()->writing(&id->head) = DiskLoc(); - - opCtx->recoveryUnit()->writingInt(_details->indexBuildsInProgress) += 1; - - // 3) indexes entry in .ns file and system.namespaces - _db->createNamespaceForIndex(opCtx, desc->indexNamespace()); - - // TODO SERVER-22727: Create an entry for path-level multikey info when creating the new index. - - // Mark the collation feature as in use if the index has a non-simple collation.
- if (spec["collation"]) { - _db->markCollationFeatureAsInUse(opCtx); - } - - return Status::OK(); -} - -void NamespaceDetailsCollectionCatalogEntry::indexBuildSuccess(OperationContext* opCtx, - StringData indexName) { - int idxNo = _findIndexNumber(opCtx, indexName); - fassert(17202, idxNo >= 0); - - // Make sure the newly created index is relocated to nIndexes, if it isn't already there - if (idxNo != getCompletedIndexCount(opCtx)) { - int toIdxNo = getCompletedIndexCount(opCtx); - - //_details->swapIndex( opCtx, idxNo, toIdxNo ); - - // flip main meta data - IndexDetails temp = _details->idx(idxNo); - *opCtx->recoveryUnit()->writing(&_details->idx(idxNo)) = _details->idx(toIdxNo); - *opCtx->recoveryUnit()->writing(&_details->idx(toIdxNo)) = temp; - - // flip multi key bits - bool tempMultikey = isIndexMultikey(idxNo); - setIndexIsMultikey(opCtx, idxNo, isIndexMultikey(toIdxNo)); - setIndexIsMultikey(opCtx, toIdxNo, tempMultikey); - - idxNo = toIdxNo; - invariant((idxNo == _findIndexNumber(opCtx, indexName))); - } - - opCtx->recoveryUnit()->writingInt(_details->indexBuildsInProgress) -= 1; - opCtx->recoveryUnit()->writingInt(_details->nIndexes) += 1; - - invariant(isIndexReady(opCtx, indexName)); -} - -void NamespaceDetailsCollectionCatalogEntry::updateTTLSetting(OperationContext* opCtx, - StringData idxName, - long long newExpireSeconds) { - int idx = _findIndexNumber(opCtx, idxName); - invariant(idx >= 0); - - IndexDetails& indexDetails = _details->idx(idx); - - BSONObj obj = _indexRecordStore->dataFor(opCtx, indexDetails.info.toRecordId()).toBson(); - const BSONElement oldExpireSecs = obj.getField("expireAfterSeconds"); - - // Important that we set the new value in-place. We are writing directly to the - // object here so must be careful not to overwrite with a longer numeric type. - - char* nonConstPtr = const_cast<char*>(oldExpireSecs.value()); - switch (oldExpireSecs.type()) { - case EOO: - massert(16631, "index does not have an 'expireAfterSeconds' field", false); - break; - case NumberInt: - *opCtx->recoveryUnit()->writing(reinterpret_cast<int*>(nonConstPtr)) = newExpireSeconds; - break; - case NumberDouble: - *opCtx->recoveryUnit()->writing(reinterpret_cast<double*>(nonConstPtr)) = - newExpireSeconds; - break; - case NumberLong: - *opCtx->recoveryUnit()->writing(reinterpret_cast<long long*>(nonConstPtr)) = - newExpireSeconds; - break; - default: - massert(16632, "current 'expireAfterSeconds' is not a number", false); - } -} - -void NamespaceDetailsCollectionCatalogEntry::_updateSystemNamespaces(OperationContext* opCtx, - const BSONObj& update) { - if (!_namespacesRecordStore) - return; - - RecordData entry = _namespacesRecordStore->dataFor(opCtx, _namespacesRecordId); - const BSONObj newEntry = applyUpdateOperators(opCtx, entry.releaseToBson(), update); - - Status result = _namespacesRecordStore->updateRecord( - opCtx, _namespacesRecordId, newEntry.objdata(), newEntry.objsize(), false, NULL); - - if (ErrorCodes::NeedsDocumentMove == result) { - // TODO SERVER-30638: using timestamp 0 for these inserts. 
- StatusWith<RecordId> newLocation = _namespacesRecordStore->insertRecord( - opCtx, newEntry.objdata(), newEntry.objsize(), Timestamp(), false); - fassert(40074, newLocation.getStatus().isOK()); - - // Invalidate old namespace record - MMAPV1DatabaseCatalogEntry::invalidateSystemCollectionRecord( - opCtx, NamespaceString(_db->name(), "system.namespaces"), _namespacesRecordId); - - _namespacesRecordStore->deleteRecord(opCtx, _namespacesRecordId); - - setNamespacesRecordId(opCtx, newLocation.getValue()); - } else { - fassert(17486, result.isOK()); - } -} - -void NamespaceDetailsCollectionCatalogEntry::updateFlags(OperationContext* opCtx, int newValue) { - NamespaceDetailsRSV1MetaData md(ns().ns(), _details); - md.replaceUserFlags(opCtx, newValue); - _updateSystemNamespaces(opCtx, BSON("$set" << BSON("options.flags" << newValue))); -} - -bool NamespaceDetailsCollectionCatalogEntry::isEqualToMetadataUUID(OperationContext* opCtx, - OptionalCollectionUUID uuid) { - if (ns().coll() == "system.namespaces") { - return true; - } - RecordData namespaceData; - invariant(_namespacesRecordStore->findRecord(opCtx, _namespacesRecordId, &namespaceData)); - - auto namespacesBson = namespaceData.releaseToBson(); - if (ns().coll() == "system.indexes") { - return !uuid && (!namespacesBson["options"].isABSONObj() || - namespacesBson["options"].Obj()["uuid"].eoo()); - } - auto optionsObj = namespacesBson["options"].Obj(); - return !optionsObj["uuid"].eoo() && UUID::parse(optionsObj["uuid"]).getValue() == uuid; -} - -void NamespaceDetailsCollectionCatalogEntry::updateValidator(OperationContext* opCtx, - const BSONObj& validator, - StringData validationLevel, - StringData validationAction) { - _updateSystemNamespaces( - opCtx, - BSON("$set" << BSON("options.validator" << validator << "options.validationLevel" - << validationLevel - << "options.validationAction" - << validationAction))); -} - -void NamespaceDetailsCollectionCatalogEntry::setIsTemp(OperationContext* opCtx, bool isTemp) { - _updateSystemNamespaces(opCtx, BSON("$set" << BSON("options.temp" << isTemp))); -} - - -void NamespaceDetailsCollectionCatalogEntry::setNamespacesRecordId(OperationContext* opCtx, - RecordId newId) { - if (newId.isNull()) { - invariant(ns().coll() == "system.namespaces" || ns().coll() == "system.indexes"); - } else { - // 'opCtx' is allowed to be null, but we don't need an OperationContext in MMAP, so that's - // OK. - auto namespaceEntry = _namespacesRecordStore->dataFor(opCtx, newId).releaseToBson(); - invariant(namespaceEntry["name"].String() == ns().ns()); - - // Register RecordId change for rollback if we're not initializing. - if (opCtx && !_namespacesRecordId.isNull()) { - auto oldNamespacesRecordId = _namespacesRecordId; - opCtx->recoveryUnit()->onRollback([=] { _namespacesRecordId = oldNamespacesRecordId; }); - } - _namespacesRecordId = newId; - } -} - -void NamespaceDetailsCollectionCatalogEntry::updateCappedSize(OperationContext* opCtx, - long long size) { - MONGO_UNREACHABLE; -} -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h deleted file mode 100644 index 3c349a890cc..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h +++ /dev/null @@ -1,147 +0,0 @@ -// namespace_details_collection_entry.h - -#pragma once - -/** -* Copyright (C) 2014 MongoDB Inc. 
-* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/base/string_data.h" -#include "mongo/bson/bsonobj.h" -#include "mongo/db/catalog/collection_catalog_entry.h" -#include "mongo/db/server_options.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -class NamespaceDetails; - -class MMAPV1DatabaseCatalogEntry; -; -class RecordStore; -class OperationContext; - -class NamespaceDetailsCollectionCatalogEntry : public CollectionCatalogEntry { -public: - NamespaceDetailsCollectionCatalogEntry(StringData ns, - NamespaceDetails* details, - RecordStore* namespacesRecordStore, - RecordId namespacesRecordId, - RecordStore* indexRecordStore, - MMAPV1DatabaseCatalogEntry* db); - - ~NamespaceDetailsCollectionCatalogEntry() {} - - CollectionOptions getCollectionOptions(OperationContext* opCtx) const final; - - int getTotalIndexCount(OperationContext* opCtx) const final; - - int getCompletedIndexCount(OperationContext* opCtx) const final; - - int getMaxAllowedIndexes() const final; - - void getAllIndexes(OperationContext* opCtx, std::vector<std::string>* names) const final; - - void getReadyIndexes(OperationContext* opCtx, std::vector<std::string>* names) const final; - - BSONObj getIndexSpec(OperationContext* opCtx, StringData idxName) const final; - - bool isIndexMultikey(OperationContext* opCtx, - StringData indexName, - MultikeyPaths* multikeyPaths) const final; - bool isIndexMultikey(int idxNo) const; - - bool setIndexIsMultikey(OperationContext* opCtx, int idxNo, bool multikey = true); - bool setIndexIsMultikey(OperationContext* opCtx, - StringData indexName, - const MultikeyPaths& multikeyPaths) final; - - RecordId getIndexHead(OperationContext* opCtx, StringData indexName) const final; - - void setIndexHead(OperationContext* opCtx, StringData indexName, const RecordId& newHead) final; - - bool isIndexReady(OperationContext* opCtx, StringData indexName) const final; - - KVPrefix getIndexPrefix(OperationContext* opCtx, StringData indexName) const final; - - Status removeIndex(OperationContext* opCtx, StringData indexName) final; - - Status prepareForIndexBuild(OperationContext* opCtx, - const IndexDescriptor* spec, - bool 
isBackgroundSecondaryBuild) final; - - void indexBuildSuccess(OperationContext* opCtx, StringData indexName) final; - - void updateTTLSetting(OperationContext* opCtx, - StringData idxName, - long long newExpireSeconds) final; - - void updateFlags(OperationContext* opCtx, int newValue) final; - - bool isEqualToMetadataUUID(OperationContext* opCtx, OptionalCollectionUUID uuid); - - void updateValidator(OperationContext* opCtx, - const BSONObj& validator, - StringData validationLevel, - StringData validationAction) final; - - void setIsTemp(OperationContext* opCtx, bool isTemp) final; - - void updateCappedSize(OperationContext* opCtx, long long size) final; - - // not part of interface, but available to my storage engine - - int _findIndexNumber(OperationContext* opCtx, StringData indexName) const; - - RecordId getNamespacesRecordId() { - return _namespacesRecordId; - } - - /** - * 'opCtx' is only allowed to be null when called from the constructor. - */ - void setNamespacesRecordId(OperationContext* opCtx, RecordId newId); - -private: - NamespaceDetails* _details; - RecordStore* _namespacesRecordStore; - - // Where this entry lives in the _namespacesRecordStore. - RecordId _namespacesRecordId; - - RecordStore* _indexRecordStore; - MMAPV1DatabaseCatalogEntry* _db; - - /** - * Updates the entry for this namespace in '_namespacesRecordStore', updating - * '_namespacesRecordId' if necessary. - */ - void _updateSystemNamespaces(OperationContext* opCtx, const BSONObj& update); - - friend class MMAPV1DatabaseCatalogEntry; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp deleted file mode 100644 index 7d5f1805d68..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.cpp +++ /dev/null @@ -1,200 +0,0 @@ -// namespace_details_rsv1_metadata.cpp - -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" - -#include "mongo/base/static_assert.h" -#include "mongo/db/operation_context.h" - -namespace mongo { - -using std::unique_ptr; -using std::numeric_limits; - -MONGO_STATIC_ASSERT(RecordStoreV1Base::Buckets == - NamespaceDetails::SmallBuckets + NamespaceDetails::LargeBuckets); - -NamespaceDetailsRSV1MetaData::NamespaceDetailsRSV1MetaData(StringData ns, NamespaceDetails* details) - : _ns(ns.toString()), _details(details) {} - -const DiskLoc& NamespaceDetailsRSV1MetaData::capExtent() const { - return _details->capExtent; -} - -void NamespaceDetailsRSV1MetaData::setCapExtent(OperationContext* opCtx, const DiskLoc& loc) { - *opCtx->recoveryUnit()->writing(&_details->capExtent) = loc; -} - -const DiskLoc& NamespaceDetailsRSV1MetaData::capFirstNewRecord() const { - return _details->capFirstNewRecord; -} - -void NamespaceDetailsRSV1MetaData::setCapFirstNewRecord(OperationContext* opCtx, - const DiskLoc& loc) { - *opCtx->recoveryUnit()->writing(&_details->capFirstNewRecord) = loc; -} - -bool NamespaceDetailsRSV1MetaData::capLooped() const { - return _details->capFirstNewRecord.isValid(); -} - -long long NamespaceDetailsRSV1MetaData::dataSize() const { - return _details->stats.datasize; -} -long long NamespaceDetailsRSV1MetaData::numRecords() const { - return _details->stats.nrecords; -} - -void NamespaceDetailsRSV1MetaData::incrementStats(OperationContext* opCtx, - long long dataSizeIncrement, - long long numRecordsIncrement) { - // durability todo : this could be a bit annoying / slow to record constantly - NamespaceDetails::Stats* s = opCtx->recoveryUnit()->writing(&_details->stats); - s->datasize += dataSizeIncrement; - s->nrecords += numRecordsIncrement; -} - -void NamespaceDetailsRSV1MetaData::setStats(OperationContext* opCtx, - long long dataSize, - long long numRecords) { - NamespaceDetails::Stats* s = opCtx->recoveryUnit()->writing(&_details->stats); - s->datasize = dataSize; - s->nrecords = numRecords; -} - -DiskLoc NamespaceDetailsRSV1MetaData::deletedListEntry(int bucket) const { - invariant(bucket >= 0 && bucket < RecordStoreV1Base::Buckets); - const DiskLoc head = (bucket < NamespaceDetails::SmallBuckets) - ? _details->deletedListSmall[bucket] - : _details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets]; - - if (head == DiskLoc(0, 0)) { - // This will happen the first time we use a "large" bucket since they were previously - // zero-initialized. - return DiskLoc(); - } - - return head; -} - -void NamespaceDetailsRSV1MetaData::setDeletedListEntry(OperationContext* opCtx, - int bucket, - const DiskLoc& loc) { - DiskLoc* head = (bucket < NamespaceDetails::SmallBuckets) - ? 
&_details->deletedListSmall[bucket] - : &_details->deletedListLarge[bucket - NamespaceDetails::SmallBuckets]; - *opCtx->recoveryUnit()->writing(head) = loc; -} - -DiskLoc NamespaceDetailsRSV1MetaData::deletedListLegacyGrabBag() const { - return _details->deletedListLegacyGrabBag; -} - -void NamespaceDetailsRSV1MetaData::setDeletedListLegacyGrabBag(OperationContext* opCtx, - const DiskLoc& loc) { - *opCtx->recoveryUnit()->writing(&_details->deletedListLegacyGrabBag) = loc; -} - -void NamespaceDetailsRSV1MetaData::orphanDeletedList(OperationContext* opCtx) { - for (int i = 0; i < RecordStoreV1Base::Buckets; i++) { - setDeletedListEntry(opCtx, i, DiskLoc()); - } - setDeletedListLegacyGrabBag(opCtx, DiskLoc()); -} - -const DiskLoc& NamespaceDetailsRSV1MetaData::firstExtent(OperationContext* opCtx) const { - return _details->firstExtent; -} - -void NamespaceDetailsRSV1MetaData::setFirstExtent(OperationContext* opCtx, const DiskLoc& loc) { - *opCtx->recoveryUnit()->writing(&_details->firstExtent) = loc; -} - -const DiskLoc& NamespaceDetailsRSV1MetaData::lastExtent(OperationContext* opCtx) const { - return _details->lastExtent; -} - -void NamespaceDetailsRSV1MetaData::setLastExtent(OperationContext* opCtx, const DiskLoc& loc) { - *opCtx->recoveryUnit()->writing(&_details->lastExtent) = loc; -} - -bool NamespaceDetailsRSV1MetaData::isCapped() const { - return _details->isCapped; -} - -bool NamespaceDetailsRSV1MetaData::isUserFlagSet(int flag) const { - return _details->userFlags & flag; -} - -int NamespaceDetailsRSV1MetaData::userFlags() const { - return _details->userFlags; -} - -bool NamespaceDetailsRSV1MetaData::setUserFlag(OperationContext* opCtx, int flag) { - if ((_details->userFlags & flag) == flag) - return false; - - opCtx->recoveryUnit()->writingInt(_details->userFlags) |= flag; - return true; -} - -bool NamespaceDetailsRSV1MetaData::clearUserFlag(OperationContext* opCtx, int flag) { - if ((_details->userFlags & flag) == 0) - return false; - - opCtx->recoveryUnit()->writingInt(_details->userFlags) &= ~flag; - return true; -} - -bool NamespaceDetailsRSV1MetaData::replaceUserFlags(OperationContext* opCtx, int flags) { - if (_details->userFlags == flags) - return false; - - opCtx->recoveryUnit()->writingInt(_details->userFlags) = flags; - return true; -} - -int NamespaceDetailsRSV1MetaData::lastExtentSize(OperationContext* opCtx) const { - return _details->lastExtentSize; -} - -void NamespaceDetailsRSV1MetaData::setLastExtentSize(OperationContext* opCtx, int newMax) { - if (_details->lastExtentSize == newMax) - return; - opCtx->recoveryUnit()->writingInt(_details->lastExtentSize) = newMax; -} - -long long NamespaceDetailsRSV1MetaData::maxCappedDocs() const { - invariant(_details->isCapped); - if (_details->maxDocsInCapped == 0x7fffffff) - return numeric_limits<long long>::max(); - return _details->maxDocsInCapped; -} -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h deleted file mode 100644 index 26f0a16803f..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h +++ /dev/null @@ -1,102 +0,0 @@ -// namespace_details_rsv1_metadata.h - -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <string> - -#include "mongo/base/string_data.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" - -namespace mongo { - -class RecordStore; - -/* - * NOTE: NamespaceDetails will become a struct - * all dur, etc... will move here - */ -class NamespaceDetailsRSV1MetaData : public RecordStoreV1MetaData { -public: - explicit NamespaceDetailsRSV1MetaData(StringData ns, NamespaceDetails* details); - - virtual ~NamespaceDetailsRSV1MetaData() {} - - virtual const DiskLoc& capExtent() const; - virtual void setCapExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual const DiskLoc& capFirstNewRecord() const; - virtual void setCapFirstNewRecord(OperationContext* opCtx, const DiskLoc& loc); - - virtual bool capLooped() const; - - virtual long long dataSize() const; - virtual long long numRecords() const; - - virtual void incrementStats(OperationContext* opCtx, - long long dataSizeIncrement, - long long numRecordsIncrement); - - virtual void setStats(OperationContext* opCtx, long long dataSize, long long numRecords); - - virtual DiskLoc deletedListEntry(int bucket) const; - virtual void setDeletedListEntry(OperationContext* opCtx, int bucket, const DiskLoc& loc); - - virtual DiskLoc deletedListLegacyGrabBag() const; - virtual void setDeletedListLegacyGrabBag(OperationContext* opCtx, const DiskLoc& loc); - - virtual void orphanDeletedList(OperationContext* opCtx); - - virtual const DiskLoc& firstExtent(OperationContext* opCtx) const; - virtual void setFirstExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual const DiskLoc& lastExtent(OperationContext* opCtx) const; - virtual void setLastExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual bool isCapped() const; - - virtual bool isUserFlagSet(int flag) const; - virtual int userFlags() const; - virtual bool setUserFlag(OperationContext* opCtx, int flag); - virtual bool clearUserFlag(OperationContext* opCtx, int flag); - virtual bool replaceUserFlags(OperationContext* opCtx, int flags); - - virtual int lastExtentSize(OperationContext* opCtx) const; - virtual void setLastExtentSize(OperationContext* opCtx, int newMax); - - virtual long long maxCappedDocs() const; - -private: - std::string _ns; - NamespaceDetails* _details; 
- RecordStore* _namespaceRecordStore; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp deleted file mode 100644 index 90fce6f33f5..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.cpp +++ /dev/null @@ -1,249 +0,0 @@ -// namespace_index.cpp - -/** - * Copyright (C) 2013 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" - -#include <boost/filesystem/operations.hpp> - -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/catalog/hashtab.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/util/exit.h" -#include "mongo/util/file.h" -#include "mongo/util/log.h" -#include "mongo/util/timer.h" - -namespace mongo { - -using std::endl; -using std::list; -using std::string; - -NamespaceIndex::NamespaceIndex(OperationContext* opCtx, - const std::string& dir, - const std::string& database) - : _dir(dir), _database(database), _f(opCtx, MongoFile::Options::SEQUENTIAL), _ht(nullptr) {} - -NamespaceIndex::~NamespaceIndex() {} - -NamespaceDetails* NamespaceIndex::details(StringData ns) const { - const Namespace n(ns); - return details(n); -} - -NamespaceDetails* NamespaceIndex::details(const Namespace& ns) const { - return _ht->get(ns); -} - -void NamespaceIndex::add_ns(OperationContext* opCtx, - StringData ns, - const DiskLoc& loc, - bool capped) { - NamespaceDetails details(loc, capped); - add_ns(opCtx, ns, &details); -} - -void NamespaceIndex::add_ns(OperationContext* opCtx, - StringData ns, - const NamespaceDetails* details) { - Namespace n(ns); - add_ns(opCtx, n, details); -} - -void NamespaceIndex::add_ns(OperationContext* opCtx, - const Namespace& ns, - const NamespaceDetails* details) { - const NamespaceString nss(ns.toString()); - invariant(opCtx->lockState()->isDbLockedForMode(nss.db(), MODE_X)); - - massert(17315, "no . in ns", nsIsFull(nss.toString())); - - uassert(10081, "too many namespaces/collections", _ht->put(opCtx, ns, *details)); -} - -void NamespaceIndex::kill_ns(OperationContext* opCtx, StringData ns) { - const NamespaceString nss(ns.toString()); - invariant(opCtx->lockState()->isDbLockedForMode(nss.db(), MODE_X)); - - const Namespace n(ns); - _ht->kill(opCtx, n); - - if (ns.size() <= Namespace::MaxNsCollectionLen) { - // Larger namespace names don't have room for $extras so they can't exist. The code - // below would cause an "$extra: ns too large" error and stacktrace to be printed to the - // log even though everything is fine. 
-        for (int i = 0; i <= 1; i++) {
-            try {
-                Namespace extra(n.extraName(i));
-                _ht->kill(opCtx, extra);
-            } catch (DBException&) {
-                LOG(3) << "caught exception in kill_ns" << endl;
-            }
-        }
-    }
-}
-
-bool NamespaceIndex::pathExists() const {
-    return boost::filesystem::exists(path());
-}
-
-boost::filesystem::path NamespaceIndex::path() const {
-    boost::filesystem::path ret(_dir);
-    if (storageGlobalParams.directoryperdb)
-        ret /= _database;
-    ret /= (_database + ".ns");
-    return ret;
-}
-
-void NamespaceIndex::getCollectionNamespaces(list<string>* tofill) const {
-    _ht->iterAll([tofill](const Namespace& k, NamespaceDetails& v) {
-        if (!k.hasDollarSign() || k == "local.oplog.$main") {
-            // we call out local.oplog.$main specifically as it's the only "normal"
-            // collection that has a $, so we make sure it gets added
-            tofill->push_back(k.toString());
-        }
-    });
-}
-
-void NamespaceIndex::maybeMkdir() const {
-    if (!storageGlobalParams.directoryperdb)
-        return;
-    boost::filesystem::path dir(_dir);
-    dir /= _database;
-    if (!boost::filesystem::exists(dir))
-        MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(boost::filesystem::create_directory(dir),
-                                           "create dir for db ");
-}
-
-void NamespaceIndex::init(OperationContext* opCtx) {
-    invariant(!_ht.get());
-
-    unsigned long long len = 0;
-
-    const boost::filesystem::path nsPath = path();
-    const std::string pathString = nsPath.string();
-
-    void* p = 0;
-
-    if (boost::filesystem::exists(nsPath)) {
-        if (_f.open(opCtx, pathString)) {
-            len = _f.length();
-
-            if (len % (1024 * 1024) != 0) {
-                StringBuilder sb;
-                sb << "Invalid length: " << len << " for .ns file: " << pathString
-                   << ". Cannot open database";
-
-                log() << sb.str();
-                uassert(10079, sb.str(), len % (1024 * 1024) == 0);
-            }
-
-            p = _f.getView();
-        }
-    } else {
-        uassert(ErrorCodes::IllegalOperation,
-                "Cannot create a database in read-only mode.",
-                !storageGlobalParams.readOnly);
-
-        // use mmapv1GlobalOptions.lenForNewNsFiles, we are making a new database
-        massert(10343,
-                "bad mmapv1GlobalOptions.lenForNewNsFiles",
-                mmapv1GlobalOptions.lenForNewNsFiles >= 1024 * 1024);
-
-        maybeMkdir();
-
-        unsigned long long l = mmapv1GlobalOptions.lenForNewNsFiles;
-        log() << "allocating new ns file " << pathString << ", filling with zeroes..." << endl;
-
-        Timer timer;
-        {
-            // Due to SERVER-15369 we need to explicitly write zero-bytes to the NS file.
-            const unsigned long long kBlockSize = 1024 * 1024;
-            invariant(l % kBlockSize == 0); // ns files can only be multiples of 1MB
-            const std::vector<char> zeros(kBlockSize, 0);
-
-            File file;
-            file.open(pathString.c_str());
-
-            massert(18825, str::stream() << "couldn't create file " << pathString, file.is_open());
-
-            for (fileofs ofs = 0; ofs < l && !file.bad(); ofs += kBlockSize) {
-                file.write(ofs, &zeros[0], kBlockSize);
-            }
-
-            if (file.bad()) {
-                try {
-                    boost::filesystem::remove(pathString);
-                } catch (const std::exception& e) {
-                    StringBuilder ss;
-                    ss << "error removing file: " << e.what();
-                    massert(18909, ss.str(), 0);
-                }
-            } else {
-                file.fsync();
-            }
-
-            massert(18826, str::stream() << "failure writing file " << pathString, !file.bad());
-        }
-
-        if (_f.create(opCtx, pathString, l)) {
-            // The writes done in this function must not be rolled back. This will leave the
-            // file empty, but available for future use. That is why we go directly to the
-            // global dur dirty list rather than going through the OperationContext.
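
// Aside: the preallocation pattern above in isolation -- a new .ns file is a
// fixed multiple of 1MB and is explicitly zero-filled block by block before
// being memory-mapped (SERVER-15369). Hypothetical sketch using <fstream>;
// the real code used mongo::File plus an fsync.
#include <fstream>
#include <vector>

bool zeroFillFile(const char* path, unsigned long long len) {
    const unsigned long long kBlockSize = 1024 * 1024;
    if (len % kBlockSize != 0)
        return false;  // .ns files can only be multiples of 1MB
    std::vector<char> zeros(kBlockSize, 0);
    std::ofstream out(path, std::ios::binary | std::ios::trunc);
    for (unsigned long long ofs = 0; ofs < len && out.good(); ofs += kBlockSize)
        out.write(zeros.data(), zeros.size());
    return out.good();  // on failure the caller removes the partial file
}
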
-            getDur().createdFile(pathString, l);
-
-            // Commit the journal and all changes to disk so that even if exceptions occur
-            // during subsequent initialization, we won't have uncommitted changes during file
-            // close.
-            getDur().commitNow(opCtx);
-
-            len = l;
-            invariant(len == mmapv1GlobalOptions.lenForNewNsFiles);
-
-            p = _f.getView();
-        }
-
-        log() << "done allocating ns file " << pathString << ", "
-              << "size: " << (len / 1024 / 1024) << "MB, "
-              << "took " << static_cast<double>(timer.millis()) / 1000.0 << " seconds";
-    }
-
-    invariant(p, str::stream() << "error couldn't open file " << pathString << " terminating");
-
-    invariant(len <= 0x7fffffff);
-    _ht.reset(new NamespaceHashTable(p, (int)len, "namespace index"));
-}
-}
diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h b/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
deleted file mode 100644
index 5b7766b4035..00000000000
--- a/src/mongo/db/storage/mmap_v1/catalog/namespace_index.h
+++ /dev/null
@@ -1,100 +0,0 @@
-// namespace_index.h
-
-/**
- * Copyright (C) 2013 10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include <list>
-#include <string>
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/db/storage/mmap_v1/catalog/namespace.h"
-#include "mongo/db/storage/mmap_v1/diskloc.h"
-#include "mongo/db/storage/mmap_v1/durable_mapped_file.h"
-
-namespace mongo {
-
-class NamespaceDetails;
-class NamespaceHashTable;
-class OperationContext;
-
-/* NamespaceIndex is the ".ns" file you see in the data directory. It is the "system catalog"
-   if you will: at least the core parts. (Additional info in system.* collections.)
-*/
-class NamespaceIndex {
-    MONGO_DISALLOW_COPYING(NamespaceIndex);
-
-public:
-    NamespaceIndex(OperationContext* opCtx, const std::string& dir, const std::string& database);
-    ~NamespaceIndex();
-
-    /**
-     * Must be called before destruction.
- */ - void close(OperationContext* opCtx) { - LockMongoFilesExclusive lock(opCtx); - _f.close(opCtx); - } - - /* returns true if the file represented by this file exists on disk */ - bool pathExists() const; - - void init(OperationContext* opCtx); - - void add_ns(OperationContext* opCtx, StringData ns, const DiskLoc& loc, bool capped); - void add_ns(OperationContext* opCtx, StringData ns, const NamespaceDetails* details); - void add_ns(OperationContext* opCtx, const Namespace& ns, const NamespaceDetails* details); - - NamespaceDetails* details(StringData ns) const; - NamespaceDetails* details(const Namespace& ns) const; - - void kill_ns(OperationContext* opCtx, StringData ns); - - bool allocated() const { - return _ht.get() != 0; - } - - void getCollectionNamespaces(std::list<std::string>* tofill) const; - - boost::filesystem::path path() const; - - unsigned long long fileLength() const { - return _f.length(); - } - -private: - void maybeMkdir() const; - - const std::string _dir; - const std::string _database; - - DurableMappedFile _f; - std::unique_ptr<NamespaceHashTable> _ht; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp b/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp deleted file mode 100644 index 85cd79be43b..00000000000 --- a/src/mongo/db/storage/mmap_v1/catalog/namespace_test.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// namespace_test.h - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#include "mongo/unittest/unittest.h" - -#include "mongo/db/storage/mmap_v1/catalog/namespace.h" - -namespace mongo { - -using std::string; - -TEST(NamespaceTest, Basics) { - Namespace foo("foo.bar"); - Namespace bar("bar.foo"); - - ASSERT_EQUALS(foo.toString(), foo.toString()); - ASSERT_EQUALS(foo.hash(), foo.hash()); - - ASSERT_NOT_EQUALS(foo.hash(), bar.hash()); - - ASSERT(foo == foo); - ASSERT(!(foo != foo)); - ASSERT(foo != bar); - ASSERT(!(foo == bar)); -} - -TEST(NamespaceTest, ExtraName) { - Namespace foo("foo.bar"); - ASSERT_FALSE(foo.isExtra()); - - string str0 = foo.extraName(0); - ASSERT_EQUALS("foo.bar$extra", str0); - Namespace ex0(str0); - ASSERT_TRUE(ex0.isExtra()); - - string str1 = foo.extraName(1); - ASSERT_EQUALS("foo.bar$extrb", str1); - Namespace ex1(str1); - ASSERT_TRUE(ex1.isExtra()); -} -} diff --git a/src/mongo/db/storage/mmap_v1/commit_notifier.cpp b/src/mongo/db/storage/mmap_v1/commit_notifier.cpp deleted file mode 100644 index 697c2dd1cdc..00000000000 --- a/src/mongo/db/storage/mmap_v1/commit_notifier.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/** - * Copyright (C) 2016 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. 
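
// Aside: the $extra naming scheme exercised by the test above -- extraName(i)
// appends "$extra" with the final letter advanced by i, giving "$extra",
// "$extrb", ... for successive overflow slots. Minimal hypothetical
// reimplementation for illustration only.
#include <cassert>
#include <string>

std::string extraNameSketch(const std::string& ns, int i) {
    std::string suffix = "$extra";
    suffix.back() += i;  // 'a' -> 'b' for i == 1
    return ns + suffix;
}

int main() {
    assert(extraNameSketch("foo.bar", 0) == "foo.bar$extra");
    assert(extraNameSketch("foo.bar", 1) == "foo.bar$extrb");
    return 0;
}
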
- */
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/db/storage/mmap_v1/commit_notifier.h"
-
-#include "mongo/util/assert_util.h"
-
-namespace mongo {
-
-CommitNotifier::CommitNotifier() = default;
-
-CommitNotifier::~CommitNotifier() {
-    invariant(!_nWaiting);
-}
-
-CommitNotifier::When CommitNotifier::now() {
-    stdx::lock_guard<stdx::mutex> lock(_mutex);
-    return ++_lastReturned;
-}
-
-void CommitNotifier::waitFor(When e) {
-    stdx::unique_lock<stdx::mutex> lock(_mutex);
-    ++_nWaiting;
-    while (_lastDone < e) {
-        _condition.wait(lock);
-    }
-}
-
-void CommitNotifier::awaitBeyondNow() {
-    stdx::unique_lock<stdx::mutex> lock(_mutex);
-    ++_nWaiting;
-    When e = ++_lastReturned;
-    while (_lastDone <= e) {
-        _condition.wait(lock);
-    }
-}
-
-void CommitNotifier::notifyAll(When e) {
-    stdx::unique_lock<stdx::mutex> lock(_mutex);
-    _lastDone = e;
-    _nWaiting = 0;
-    _condition.notify_all();
-}
-
-}  // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/commit_notifier.h b/src/mongo/db/storage/mmap_v1/commit_notifier.h
deleted file mode 100644
index bbb40a14576..00000000000
--- a/src/mongo/db/storage/mmap_v1/commit_notifier.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Copyright (C) 2016 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects
- * for all of the code used other than as permitted herein. If you modify
- * file(s) with this exception, you may extend this exception to your
- * version of the file(s), but you are not obligated to do so. If you do not
- * wish to do so, delete this exception statement from your version. If you
- * delete this exception statement from all source files in the program,
- * then also delete it in the license file.
- */
-
-#pragma once
-
-#include "mongo/base/disallow_copying.h"
-#include "mongo/stdx/condition_variable.h"
-#include "mongo/stdx/mutex.h"
-
-namespace mongo {
-
-/**
- * Establishes a synchronization point between threads. N threads wait and one notifies.
- */
-class CommitNotifier {
-    MONGO_DISALLOW_COPYING(CommitNotifier);
-
-public:
-    typedef unsigned long long When;
-
-    CommitNotifier();
-    ~CommitNotifier();
-
-    When now();
-
-    /**
-     * Awaits the next notifyAll() call by another thread. Notifications that precede this call are
-     * ignored -- we are looking for a fresh event.
-     */
-    void waitFor(When e);
-
-    /**
-     * A bit faster than waitFor(now()).
-     */
-    void awaitBeyondNow();
-
-    /**
-     * May be called multiple times. Notifies all waiters.
-     */
-    void notifyAll(When e);
-
-    /**
-     * Returns how many threads are blocked in the waitFor/awaitBeyondNow calls.
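
// Aside: a hypothetical usage sketch for the class above -- each side takes a
// ticket from now(), waiters block on it, and the committing thread releases
// everyone with notifyAll(). Compiles only against the header being deleted
// here; the function names are illustrative.
#include "mongo/db/storage/mmap_v1/commit_notifier.h"

void waiterSide(mongo::CommitNotifier& notifier) {
    // Take a ticket first, then block until a commit covering it completes.
    mongo::CommitNotifier::When ticket = notifier.now();
    notifier.waitFor(ticket);
}

void committerSide(mongo::CommitNotifier& notifier) {
    // Take a ticket, do the flush, then wake every blocked waiter at once.
    mongo::CommitNotifier::When e = notifier.now();
    // ... write and sync the journal ...
    notifier.notifyAll(e);
}
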
- */ - unsigned nWaiting() const { - return _nWaiting; - } - -private: - stdx::mutex _mutex; - stdx::condition_variable _condition; - - When _lastDone{0}; - When _lastReturned{0}; - unsigned _nWaiting{0}; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/compress.cpp b/src/mongo/db/storage/mmap_v1/compress.cpp deleted file mode 100644 index 8f8dce527ed..00000000000 --- a/src/mongo/db/storage/mmap_v1/compress.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// @file compress.cpp - -/** -* Copyright (C) 2012 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects -* for all of the code used other than as permitted herein. If you modify -* file(s) with this exception, you may extend this exception to your -* version of the file(s), but you are not obligated to do so. If you do not -* wish to do so, delete this exception statement from your version. If you -* delete this exception statement from all source files in the program, -* then also delete it in the license file. -*/ - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/compress.h" - -#include <snappy.h> - -namespace mongo { - -void rawCompress(const char* input, - size_t input_length, - char* compressed, - size_t* compressed_length) { - snappy::RawCompress(input, input_length, compressed, compressed_length); -} - -size_t maxCompressedLength(size_t source_len) { - return snappy::MaxCompressedLength(source_len); -} - -size_t compress(const char* input, size_t input_length, std::string* output) { - return snappy::Compress(input, input_length, output); -} - -bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed) { - return snappy::Uncompress(compressed, compressed_length, uncompressed); -} -} diff --git a/src/mongo/db/storage/mmap_v1/compress.h b/src/mongo/db/storage/mmap_v1/compress.h deleted file mode 100644 index 8ff828a93a6..00000000000 --- a/src/mongo/db/storage/mmap_v1/compress.h +++ /dev/null @@ -1,46 +0,0 @@ -// @file compress.h - -/** -* Copyright (C) 2012 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. 
If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects -* for all of the code used other than as permitted herein. If you modify -* file(s) with this exception, you may extend this exception to your -* version of the file(s), but you are not obligated to do so. If you do not -* wish to do so, delete this exception statement from your version. If you -* delete this exception statement from all source files in the program, -* then also delete it in the license file. -*/ - -#pragma once - -#include <string> - -namespace mongo { - -size_t compress(const char* input, size_t input_length, std::string* output); - -bool uncompress(const char* compressed, size_t compressed_length, std::string* uncompressed); - -size_t maxCompressedLength(size_t source_len); -void rawCompress(const char* input, - size_t input_length, - char* compressed, - size_t* compressed_length); -} diff --git a/src/mongo/db/storage/mmap_v1/data_file.cpp b/src/mongo/db/storage/mmap_v1/data_file.cpp deleted file mode 100644 index 46af46c0a47..00000000000 --- a/src/mongo/db/storage/mmap_v1/data_file.cpp +++ /dev/null @@ -1,253 +0,0 @@ -// data_file.cpp - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
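
// Aside: a round-trip through the thin snappy wrappers declared above, which
// is essentially all they forwarded to. Standalone sketch calling snappy
// directly.
#include <cassert>
#include <string>
#include <snappy.h>

int main() {
    const std::string input(8192, 'x');  // highly compressible payload
    std::string compressed, restored;
    snappy::Compress(input.data(), input.size(), &compressed);
    assert(compressed.size() < input.size());
    assert(snappy::Uncompress(compressed.data(), compressed.size(), &restored));
    assert(restored == input);
    return 0;
}
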
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/data_file.h" - -#include <boost/filesystem/operations.hpp> -#include <utility> -#include <vector> - -#include "mongo/base/static_assert.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::endl; - -namespace { - -void data_file_check(void* _mb) { - if (sizeof(char*) == 4) { - uassert(10084, - "can't map file memory - mongo requires 64 bit build for larger datasets", - _mb != NULL); - } else { - uassert(10085, "can't map file memory", _mb != NULL); - } -} - -} // namespace - - -MONGO_STATIC_ASSERT(DataFileHeader::HeaderSize == 8192); -MONGO_STATIC_ASSERT(sizeof(static_cast<DataFileHeader*>(NULL)->data) == 4); -MONGO_STATIC_ASSERT(sizeof(DataFileHeader) - sizeof(static_cast<DataFileHeader*>(NULL)->data) == - DataFileHeader::HeaderSize); - - -int DataFile::maxSize() { - if (sizeof(int*) == 4) { - return 512 * 1024 * 1024; - } else if (mmapv1GlobalOptions.smallfiles) { - return 0x7ff00000 >> 2; - } else { - return 0x7ff00000; - } -} - -NOINLINE_DECL void DataFile::badOfs(int ofs) const { - msgasserted(13440, - str::stream() << "bad offset:" << ofs << " accessing file: " << mmf.filename() - << ". See http://dochub.mongodb.org/core/data-recovery"); -} - -int DataFile::_defaultSize() const { - int size; - - if (_fileNo <= 4) { - size = (64 * 1024 * 1024) << _fileNo; - } else { - size = 0x7ff00000; - } - - if (mmapv1GlobalOptions.smallfiles) { - size = size >> 2; - } - - return size; -} - -/** @return true if found and opened. if uninitialized (prealloc only) does not open. */ -Status DataFile::openExisting(OperationContext* opCtx, const char* filename) { - invariant(_mb == 0); - - if (!boost::filesystem::exists(filename)) { - return Status(ErrorCodes::InvalidPath, "DataFile::openExisting - file does not exist"); - } - - if (!mmf.open(opCtx, filename)) { - return Status(ErrorCodes::InternalError, "DataFile::openExisting - mmf.open failed"); - } - - // The mapped view of the file should never be NULL if the open call above succeeded. 
- _mb = mmf.getView(); - invariant(_mb); - - const uint64_t sz = mmf.length(); - invariant(sz <= 0x7fffffff); - invariant(sz % 4096 == 0); - - if (sz < 64 * 1024 * 1024 && !mmapv1GlobalOptions.smallfiles) { - if (sz >= 16 * 1024 * 1024 && sz % (1024 * 1024) == 0) { - log() << "info openExisting file size " << sz - << " but mmapv1GlobalOptions.smallfiles=false: " << filename << endl; - } else { - log() << "openExisting size " << sz << " less than minimum file size expectation " - << filename << endl; - verify(false); - } - } - - data_file_check(_mb); - return Status::OK(); -} - -void DataFile::open(OperationContext* opCtx, - const char* filename, - int minSize, - bool preallocateOnly) { - long size = _defaultSize(); - - while (size < minSize) { - if (size < maxSize() / 2) { - size *= 2; - } else { - size = maxSize(); - break; - } - } - - if (size > maxSize()) { - size = maxSize(); - } - - invariant(size >= 64 * 1024 * 1024 || mmapv1GlobalOptions.smallfiles); - invariant(size % 4096 == 0); - - if (preallocateOnly) { - if (mmapv1GlobalOptions.prealloc) { - FileAllocator::get()->requestAllocation(filename, size); - } - return; - } - - { - invariant(_mb == 0); - unsigned long long sz = size; - if (mmf.create(opCtx, filename, sz)) { - _mb = mmf.getView(); - } - - invariant(sz <= 0x7fffffff); - size = (int)sz; - } - - data_file_check(_mb); - header()->init(opCtx, _fileNo, size, filename); -} - -void DataFile::flush(bool sync) { - mmf.flush(sync); -} - -DiskLoc DataFile::allocExtentArea(OperationContext* opCtx, int size) { - // The header would be NULL if file open failed. However, if file open failed we should - // never be entering here. - invariant(header()); - invariant(size <= header()->unusedLength); - - int offset = header()->unused.getOfs(); - - DataFileHeader* h = header(); - *opCtx->recoveryUnit()->writing(&h->unused) = DiskLoc(_fileNo, offset + size); - opCtx->recoveryUnit()->writingInt(h->unusedLength) = h->unusedLength - size; - - return DiskLoc(_fileNo, offset); -} - -// ------------------------------------------------------------------------------- - -void DataFileHeader::init(OperationContext* opCtx, - int fileno, - int filelength, - const char* filename) { - if (uninitialized()) { - DEV log() << "datafileheader::init initializing " << filename << " n:" << fileno << endl; - - massert(13640, - str::stream() << "DataFileHeader looks corrupt at file open filelength:" - << filelength - << " fileno:" - << fileno, - filelength > 32768); - - // The writes done in this function must not be rolled back. If the containing - // UnitOfWork rolls back it should roll back to the state *after* these writes. This - // will leave the file empty, but available for future use. That is why we go directly - // to the global dur dirty list rather than going through the RecoveryUnit. 
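
// Aside: the data-file sizing policy above in isolation -- file n defaults to
// 64MB << n through file 4, later files start at the ~2GB cap (0x7ff00000),
// and open() keeps doubling until the requested minimum fits. Hypothetical
// 64-bit sketch that ignores the smallfiles quartering.
#include <cassert>

long long defaultSizeForFile(int fileNo) {
    return fileNo <= 4 ? (64LL * 1024 * 1024) << fileNo : 0x7ff00000LL;
}

long long sizeFor(int fileNo, long long minSize) {
    const long long maxSize = 0x7ff00000LL;
    long long size = defaultSizeForFile(fileNo);
    while (size < minSize) {
        if (size < maxSize / 2) {
            size *= 2;
        } else {
            size = maxSize;
            break;
        }
    }
    return size > maxSize ? maxSize : size;
}

int main() {
    assert(defaultSizeForFile(0) == 64LL * 1024 * 1024);   // 64MB
    assert(defaultSizeForFile(1) == 128LL * 1024 * 1024);  // 128MB
    assert(defaultSizeForFile(5) == 0x7ff00000LL);         // capped
    assert(sizeFor(0, 200LL * 1024 * 1024) == 256LL * 1024 * 1024);
    return 0;
}
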
- getDur().createdFile(filename, filelength); - - typedef std::pair<void*, unsigned> Intent; - std::vector<Intent> intent; - intent.push_back(std::make_pair(this, sizeof(DataFileHeader))); - privateViews.makeWritable(this, sizeof(DataFileHeader)); - getDur().declareWriteIntents(intent); - - fileLength = filelength; - version = DataFileVersion::defaultForNewFiles(); - unused.set(fileno, HeaderSize); - unusedLength = fileLength - HeaderSize - 16; - freeListStart.Null(); - freeListEnd.Null(); - } else { - checkUpgrade(opCtx); - } -} - -void DataFileHeader::checkUpgrade(OperationContext* opCtx) { - if (freeListStart == DiskLoc(0, 0)) { - // we are upgrading from 2.4 to 2.6 - invariant(freeListEnd == DiskLoc(0, 0)); // both start and end should be (0,0) or real - WriteUnitOfWork wunit(opCtx); - *opCtx->recoveryUnit()->writing(&freeListStart) = DiskLoc(); - *opCtx->recoveryUnit()->writing(&freeListEnd) = DiskLoc(); - wunit.commit(); - } -} -} diff --git a/src/mongo/db/storage/mmap_v1/data_file.h b/src/mongo/db/storage/mmap_v1/data_file.h deleted file mode 100644 index 60dc095791e..00000000000 --- a/src/mongo/db/storage/mmap_v1/data_file.h +++ /dev/null @@ -1,264 +0,0 @@ -// data_file.h - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#pragma once - -#include "mongo/bson/util/builder.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/platform/bits.h" - -namespace mongo { - -class OperationContext; - -#pragma pack(1) -class DataFileVersion { -public: - DataFileVersion(uint32_t major, uint32_t minor) : _major(major), _minor(minor) {} - - static DataFileVersion defaultForNewFiles() { - return DataFileVersion(kCurrentMajor, kIndexes24AndNewer | kMayHave30Freelist); - } - - Status isCompatibleWithCurrentCode() const { - if (_major != kCurrentMajor) { - StringBuilder sb; - sb << "The data files have major version " << _major - << ", but this version of mongod only supports version " << kCurrentMajor; - return {ErrorCodes::MustUpgrade, sb.str()}; - } - - uint32_t unrecognizedMinorBits = _minor & ~kUsedMinorFlagsMask; - if (unrecognizedMinorBits) { - StringBuilder sb; - sb << "The data files use features not recognized by this version of mongod; the" - " feature bits in positions [ "; - bool firstIteration = true; - while (unrecognizedMinorBits) { - const int lowestSetBitPosition = countTrailingZeros64(unrecognizedMinorBits); - if (!firstIteration) { - sb << ", "; - } - sb << lowestSetBitPosition; - unrecognizedMinorBits ^= (1 << lowestSetBitPosition); - firstIteration = false; - } - sb << " ] aren't recognized by this version of mongod"; - - return {ErrorCodes::MustUpgrade, sb.str()}; - } - - const uint32_t indexCleanliness = _minor & kIndexPluginMask; - if (indexCleanliness != kIndexes24AndNewer && indexCleanliness != kIndexes22AndOlder) { - StringBuilder sb; - sb << "The data files have index plugin version " << indexCleanliness - << ", but this version of mongod only supports versions " << kIndexes22AndOlder - << " and " << kIndexes24AndNewer; - return {ErrorCodes::MustUpgrade, sb.str()}; - } - - // We are compatible with either setting of kMayHave30Freelist. - - return Status::OK(); - } - - bool is24IndexClean() const { - return (_minor & kIndexPluginMask) == kIndexes24AndNewer; - } - void setIs24IndexClean() { - _minor = ((_minor & ~kIndexPluginMask) | kIndexes24AndNewer); - } - - bool mayHave30Freelist() const { - return _minor & kMayHave30Freelist; - } - void setMayHave30Freelist() { - _minor |= kMayHave30Freelist; - } - - bool getMayHaveCollationMetadata() const { - return _minor & kMayHaveCollationMetadata; - } - void setMayHaveCollationMetadata() { - _minor |= kMayHaveCollationMetadata; - } - - uint32_t majorRaw() const { - return _major; - } - uint32_t minorRaw() const { - return _minor; - } - -private: - static const uint32_t kCurrentMajor = 4; - - // minor layout: - // first 4 bits - index plugin cleanliness. - // see IndexCatalog::_upgradeDatabaseMinorVersionIfNeeded for details - // 5th bit - 1 if started with 3.0-style freelist implementation (SERVER-14081) - // 6th bit - 1 if indexes or collections with a collation have been created. - // 7th through 31st bit - reserved and must be set to 0. - static const uint32_t kIndexPluginMask = 0xf; - static const uint32_t kIndexes22AndOlder = 5; - static const uint32_t kIndexes24AndNewer = 6; - - static const uint32_t kMayHave30Freelist = (1 << 4); - - static const uint32_t kMayHaveCollationMetadata = (1 << 5); - - // All set bits we know about are covered by this mask. 
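
A standalone sketch of the feature-bit test that isCompatibleWithCurrentCode() above performs with the mask defined just below; the constants are copied from this class, while the function name and main() are illustrative only:

    #include <cstdint>
    #include <cstdio>

    // Returns the minor-version bits this version of mongod does not understand.
    uint32_t unrecognizedMinorBits(uint32_t minor) {
        const uint32_t kUsedMinorFlagsMask = 0xf | (1 << 4) | (1 << 5);  // = 0x3f
        return minor & ~kUsedMinorFlagsMask;
    }

    int main() {
        // 6 = kIndexes24AndNewer; bit 10 stands in for an unknown future feature.
        std::printf("%#x\n", unrecognizedMinorBits(6 | (1u << 10)));  // prints 0x400
        return 0;
    }

A nonzero result is what drives the ErrorCodes::MustUpgrade path exercised by data_file_version_test.cpp further down in this diff.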
- static const uint32_t kUsedMinorFlagsMask = - kIndexPluginMask | kMayHave30Freelist | kMayHaveCollationMetadata; - - uint32_t _major; - uint32_t _minor; -}; - -// Note: Intentionally not defining relational operators for DataFileVersion as there is no -// total ordering of all versions now that '_minor' is used as a bit vector. -#pragma pack() - -/* a datafile - i.e. the "dbname.<#>" files : - - ---------------------- - DataFileHeader - ---------------------- - Extent (for a particular namespace) - MmapV1RecordHeader - ... - MmapV1RecordHeader (some chained for unused space) - ---------------------- - more Extents... - ---------------------- -*/ -#pragma pack(1) -class DataFileHeader { -public: - DataFileVersion version; - int fileLength; - /** - * unused is the portion of the file that doesn't belong to any allocated extents. -1 = no more - */ - DiskLoc unused; - int unusedLength; - DiskLoc freeListStart; - DiskLoc freeListEnd; - char reserved[8192 - 4 * 4 - 8 * 3]; - - char data[4]; // first extent starts here - - enum { HeaderSize = 8192 }; - - bool uninitialized() const { - return version.majorRaw() == 0; - } - - void init(OperationContext* opCtx, int fileno, int filelength, const char* filename); - - void checkUpgrade(OperationContext* opCtx); - - bool isEmpty() const { - return uninitialized() || (unusedLength == fileLength - HeaderSize - 16); - } -}; -#pragma pack() - - -class DataFile { -public: - DataFile(OperationContext* opCtx, int fn) : _fileNo(fn), mmf(opCtx), _mb(NULL) {} - - /** @return true if found and opened. if uninitialized (prealloc only) does not open. */ - Status openExisting(OperationContext* opCtx, const char* filename); - - /** creates if DNE */ - void open(OperationContext* opCtx, - const char* filename, - int requestedDataSize = 0, - bool preallocateOnly = false); - - /** - * Must be called before destruction. - */ - void close(OperationContext* opCtx) { - LockMongoFilesExclusive lock(opCtx); - mmf.close(opCtx); - } - - DiskLoc allocExtentArea(OperationContext* opCtx, int size); - - DataFileHeader* getHeader() { - return header(); - } - const DataFileHeader* getHeader() const { - return header(); - } - - HANDLE getFd() { - return mmf.getFd(); - } - unsigned long long length() const { - return mmf.length(); - } - - /* return max size an extent may be */ - static int maxSize(); - - /** fsync */ - void flush(bool sync); - -private: - friend class MmapV1ExtentManager; - - - void badOfs(int) const; - int _defaultSize() const; - - void grow(DiskLoc dl, int size); - - char* p() const { - return (char*)_mb; - } - DataFileHeader* header() { - return static_cast<DataFileHeader*>(_mb); - } - const DataFileHeader* header() const { - return static_cast<DataFileHeader*>(_mb); - } - - - const int _fileNo; - - DurableMappedFile mmf; - void* _mb; // the memory mapped view -}; -} diff --git a/src/mongo/db/storage/mmap_v1/data_file_sync.cpp b/src/mongo/db/storage/mmap_v1/data_file_sync.cpp deleted file mode 100644 index 975b1c3413e..00000000000 --- a/src/mongo/db/storage/mmap_v1/data_file_sync.cpp +++ /dev/null @@ -1,137 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/data_file_sync.h" - -#include "mongo/db/client.h" -#include "mongo/db/commands/server_status_metric.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/util/exit.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::endl; - -DataFileSync dataFileSync; - -DataFileSync::DataFileSync() - : ServerStatusSection("backgroundFlushing"), _total_time(0), _flushes(0), _last() {} - -void DataFileSync::run() { - Client::initThread(name().c_str()); - - if (storageGlobalParams.syncdelay == 0) { - log() << "warning: --syncdelay 0 is not recommended and can have strange performance" - << endl; - } else if (storageGlobalParams.syncdelay == 1) { - log() << "--syncdelay 1" << endl; - } else if (storageGlobalParams.syncdelay != 60) { - LOG(1) << "--syncdelay " << storageGlobalParams.syncdelay.load() << endl; - } - int time_flushing = 0; - while (!globalInShutdownDeprecated()) { - if (storageGlobalParams.syncdelay == 0) { - // in case at some point we add an option to change at runtime - sleepsecs(5); - continue; - } - - sleepmillis( - (long long)std::max(0.0, (storageGlobalParams.syncdelay * 1000) - time_flushing)); - - if (globalInShutdownDeprecated()) { - // occasional issue trying to flush during shutdown when sleep interrupted - break; - } - - auto opCtx = cc().makeOperationContext(); - Date_t start = jsTime(); - StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine(); - - dur::notifyPreDataFileFlush(); - int numFiles = storageEngine->flushAllFiles(opCtx.get(), true); - dur::notifyPostDataFileFlush(); - - time_flushing = durationCount<Milliseconds>(jsTime() - start); - - _flushed(time_flushing); - - if (shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000) { - log() << "flushing mmaps took " << time_flushing << "ms " - << " for " << numFiles << " files" << endl; - } - } -} - -BSONObj DataFileSync::generateSection(OperationContext* opCtx, - const BSONElement& configElement) const { - if (!running()) { - return BSONObj(); - } - - BSONObjBuilder b; - b.appendNumber("flushes", _flushes); - b.appendNumber("total_ms", _total_time); - b.appendNumber("average_ms", 
(_flushes ? (_total_time / double(_flushes)) : 0.0)); - b.appendNumber("last_ms", _last_time); - b.append("last_finished", _last); - return b.obj(); -} - -void DataFileSync::_flushed(int ms) { - _flushes++; - _total_time += ms; - _last_time = ms; - _last = jsTime(); -} - - -class MemJournalServerStatusMetric : public ServerStatusMetric { -public: - MemJournalServerStatusMetric() : ServerStatusMetric(".mem.mapped") {} - virtual void appendAtLeaf(BSONObjBuilder& b) const { - int m = MemoryMappedFile::totalMappedLengthInMB(); - b.appendNumber("mapped", m); - - if (storageGlobalParams.dur) { - m *= 2; - b.appendNumber("mappedWithJournal", m); - } - } -} memJournalServerStatusMetric; -} diff --git a/src/mongo/db/storage/mmap_v1/data_file_sync.h b/src/mongo/db/storage/mmap_v1/data_file_sync.h deleted file mode 100644 index a26624f2c41..00000000000 --- a/src/mongo/db/storage/mmap_v1/data_file_sync.h +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/commands/server_status.h" -#include "mongo/util/background.h" - -namespace mongo { - -/** - * does background async flushes of mmapped files - */ -class DataFileSync : public BackgroundJob, public ServerStatusSection { -public: - DataFileSync(); - - virtual bool includeByDefault() const { - return true; - } - virtual std::string name() const { - return "DataFileSync"; - } - - void run(); - - virtual BSONObj generateSection(OperationContext* opCtx, - const BSONElement& configElement) const; - -private: - void _flushed(int ms); - - long long _total_time; - long long _flushes; - int _last_time; - Date_t _last; -}; - -extern DataFileSync dataFileSync; -} diff --git a/src/mongo/db/storage/mmap_v1/data_file_version_test.cpp b/src/mongo/db/storage/mmap_v1/data_file_version_test.cpp deleted file mode 100644 index 40627007a19..00000000000 --- a/src/mongo/db/storage/mmap_v1/data_file_version_test.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/** - * Copyright (C) 2016 MongoDB Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/data_file.h" - -#include "mongo/unittest/unittest.h" - -namespace mongo { -namespace { - -TEST(DataFileVersionTest, DefaultForNewFilesIsCompatibleWithCurrentCode) { - auto version = DataFileVersion::defaultForNewFiles(); - ASSERT_OK(version.isCompatibleWithCurrentCode()); -} - -TEST(DataFileVersionTest, CanSetIs24IndexClean) { - const uint32_t major = 4; - const uint32_t minor = 5; - DataFileVersion version(major, minor); - ASSERT_OK(version.isCompatibleWithCurrentCode()); - - ASSERT_FALSE(version.is24IndexClean()); - version.setIs24IndexClean(); - ASSERT_TRUE(version.is24IndexClean()); -} - -TEST(DataFileVersionTest, CanSetMayHave30Freelist) { - const uint32_t major = 4; - const uint32_t minor = 5; - DataFileVersion version(major, minor); - ASSERT_OK(version.isCompatibleWithCurrentCode()); - - ASSERT_FALSE(version.mayHave30Freelist()); - version.setMayHave30Freelist(); - ASSERT_TRUE(version.mayHave30Freelist()); -} - -TEST(DataFileVersionTest, CanSetMayHaveCollationMetadata) { - auto version = DataFileVersion::defaultForNewFiles(); - ASSERT_OK(version.isCompatibleWithCurrentCode()); - - ASSERT_FALSE(version.getMayHaveCollationMetadata()); - version.setMayHaveCollationMetadata(); - ASSERT_TRUE(version.getMayHaveCollationMetadata()); - ASSERT_OK(version.isCompatibleWithCurrentCode()); -} - -TEST(DataFileVersionTest, MustUpgradeWhenMajorVersionIsUnsupported) { - const uint32_t major = 5; - const uint32_t minor = 6; - DataFileVersion version(major, minor); - auto status = version.isCompatibleWithCurrentCode(); - ASSERT_EQ(ErrorCodes::MustUpgrade, status.code()); - ASSERT_EQ( - "The data files have major version 5, but this version of mongod only supports version 4", - status.reason()); -} - -TEST(DataFileVersionTest, MustUpgradeWhenSingleMinorFeatureBitIsUnrecognized) { - const uint32_t major = 4; - const uint32_t minor = 6 | (1 << 10); - DataFileVersion version(major, minor); - auto status = version.isCompatibleWithCurrentCode(); - ASSERT_EQ(ErrorCodes::MustUpgrade, status.code()); - ASSERT_EQ( - "The data files use 
features not recognized by this version of mongod; the feature bits in" - " positions [ 10 ] aren't recognized by this version of mongod", - status.reason()); -} - -TEST(DataFileVersionTest, MustUpgradeWhenMultipleMinorFeatureBitsAreUnrecognized) { - const uint32_t major = 4; - const uint32_t minor = 6 | (1 << 10) | (1 << 14) | (1 << 15); - DataFileVersion version(major, minor); - auto status = version.isCompatibleWithCurrentCode(); - ASSERT_EQ(ErrorCodes::MustUpgrade, status.code()); - ASSERT_EQ( - "The data files use features not recognized by this version of mongod; the feature bits in" - " positions [ 10, 14, 15 ] aren't recognized by this version of mongod", - status.reason()); -} - -TEST(DataFileVersionTest, MustUpgradeWhenIndexPluginVersionIsUnsupported) { - const uint32_t major = 4; - const uint32_t minor = 7; - DataFileVersion version(major, minor); - auto status = version.isCompatibleWithCurrentCode(); - ASSERT_EQ(ErrorCodes::MustUpgrade, status.code()); - ASSERT_EQ( - "The data files have index plugin version 7, but this version of mongod only supports" - " versions 5 and 6", - status.reason()); -} - -} // namespace -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/diskloc.h b/src/mongo/db/storage/mmap_v1/diskloc.h deleted file mode 100644 index 24ff75c7609..00000000000 --- a/src/mongo/db/storage/mmap_v1/diskloc.h +++ /dev/null @@ -1,222 +0,0 @@ -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -/* @file diskloc.h - - Storage subsystem management. - Lays out our datafiles on disk, manages disk space. -*/ - -#pragma once - -#include <boost/functional/hash.hpp> -#include <cstdint> - -#include "mongo/db/jsobj.h" -#include "mongo/db/record_id.h" - -namespace mongo { - -template <class Version> -class BtreeBucket; - -#pragma pack(1) -/** represents a disk location/offset on disk in a database. 64 bits. - it is assumed these will be passed around by value a lot so don't do anything to make them large - (such as adding a virtual function) - */ -class DiskLoc { - // this will be volume, file #, etc. 
but is a logical value could be anything depending on - // storage engine - int _a; - int ofs; - -public: - enum SentinelValues { - /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but - * outside DiskLoc context so confusing as-is. */ - NullOfs = -1, - - // Caps the number of files that may be allocated in a database, allowing about 32TB of - // data per db. Note that the DiskLoc and DiskLoc56Bit types supports more files than - // this value, as does the data storage format. - MaxFiles = 16000, - - // How invalid DiskLocs are represented in RecordIds. - InvalidRepr = -2LL, - }; - - DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) {} - DiskLoc() { - Null(); - } - - // Minimum allowed DiskLoc. No MmapV1RecordHeader may begin at this location because file and - // extent headers must precede Records in a file. - static DiskLoc min() { - return DiskLoc(0, 0); - } - - // Maximum allowed DiskLoc. - // No MmapV1RecordHeader may begin at this location because the minimum size of a - // MmapV1RecordHeader is larger than one byte. Also, the last bit is not able to be used - // because mmapv1 uses that for "used". - static DiskLoc max() { - return DiskLoc(0x7fffffff, 0x7ffffffe); - } - - bool questionable() const { - return ofs < -1 || _a < -1 || _a > 524288; - } - - bool isNull() const { - return _a == -1; - } - DiskLoc& Null() { - _a = -1; - /* note NullOfs is different. todo clean up. see refs to NullOfs in code - use is valid but - * outside DiskLoc context so confusing as-is. */ - ofs = 0; - return *this; - } - void assertOk() const { - verify(!isNull()); - } - DiskLoc& setInvalid() { - _a = -2; - ofs = 0; - return *this; - } - bool isValid() const { - return _a != -2; - } - - std::string toString() const { - if (isNull()) - return "null"; - std::stringstream ss; - ss << _a << ':' << std::hex << ofs; - return ss.str(); - } - - BSONObj toBSONObj() const { - return BSON("file" << _a << "offset" << ofs); - } - - int a() const { - return _a; - } - - int& GETOFS() { - return ofs; - } - int getOfs() const { - return ofs; - } - void set(int a, int b) { - _a = a; - ofs = b; - } - - void inc(int amt) { - verify(!isNull()); - ofs += amt; - } - - bool sameFile(DiskLoc b) { - return _a == b._a; - } - - bool operator==(const DiskLoc& b) const { - return _a == b._a && ofs == b.ofs; - } - bool operator!=(const DiskLoc& b) const { - return !(*this == b); - } - int compare(const DiskLoc& b) const { - int x = _a - b._a; - if (x) - return x; - return ofs - b.ofs; - } - - static DiskLoc fromRecordId(RecordId id) { - if (id.isNormal()) - return DiskLoc((id.repr() >> 32), uint32_t(id.repr())); - - if (id.isNull()) - return DiskLoc(); - - if (id == RecordId::max()) - return DiskLoc::max(); - - if (id == RecordId::min()) - return DiskLoc::min(); - - dassert(id.repr() == InvalidRepr); - return DiskLoc().setInvalid(); - } - - RecordId toRecordId() const { - if (_a >= 0) { - if (*this == DiskLoc::min()) - return RecordId::min(); - - if (*this == DiskLoc::max()) - return RecordId::max(); - - return RecordId(uint64_t(_a) << 32 | uint32_t(ofs)); - } - - if (isNull()) - return RecordId(); - - dassert(!isValid()); - return RecordId(InvalidRepr); - } -}; -#pragma pack() - -inline bool operator<(const DiskLoc& rhs, const DiskLoc& lhs) { - return rhs.compare(lhs) < 0; -} -inline bool operator<=(const DiskLoc& rhs, const DiskLoc& lhs) { - return rhs.compare(lhs) <= 0; -} -inline bool operator>(const DiskLoc& rhs, const DiskLoc& lhs) { - return rhs.compare(lhs) > 0; -} -inline bool operator>=(const 
DiskLoc& rhs, const DiskLoc& lhs) { - return rhs.compare(lhs) >= 0; -} - -inline std::ostream& operator<<(std::ostream& stream, const DiskLoc& loc) { - return stream << loc.toString(); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur.cpp b/src/mongo/db/storage/mmap_v1/dur.cpp deleted file mode 100644 index 835f4302647..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur.cpp +++ /dev/null @@ -1,917 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -/* - phases: - - PREPLOGBUFFER - we will build an output buffer ourself and then use O_DIRECT - we could be in read lock for this - for very large objects write directly to redo log in situ? - WRITETOJOURNAL - we could be unlocked (the main db lock that is...) for this, with sufficient care, but there - is some complexity have to handle falling behind which would use too much ram (going back - into a read lock would suffice to stop that). for now (1.7.5/1.8.0) we are in read lock which - is not ideal. - WRITETODATAFILES - actually write to the database data files in this phase. currently done by memcpy'ing the - writes back to the non-private MMF. alternatively one could write to the files the - traditional way; however the way our storage engine works that isn't any faster (actually - measured a tiny bit slower). - REMAPPRIVATEVIEW - we could in a write lock quickly flip readers back to the main view, then stay in read lock - and do our real remapping. with many files (e.g., 1000), remapping could be time consuming - (several ms), so we don't want to be too frequent. there could be a slow down immediately - after remapping as fresh copy-on-writes for commonly written pages will - be required. so doing these remaps fractionally is helpful. - - mutexes: - - READLOCK dbMutex (big 'R') - LOCK groupCommitMutex - PREPLOGBUFFER() - READLOCK mmmutex - commitJob.reset() - UNLOCK dbMutex // now other threads can write - WRITETOJOURNAL() - WRITETODATAFILES() - UNLOCK mmmutex - UNLOCK groupCommitMutex - - every Nth groupCommit, at the end, we REMAPPRIVATEVIEW() at the end of the work. 
because of - that we are in W lock for that groupCommit, which is nonideal of course. - - @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc -*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kJournal - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur.h" - -#include <iomanip> -#include <utility> - -#include "mongo/base/static_assert.h" -#include "mongo/db/client.h" -#include "mongo/db/commands/server_status.h" -#include "mongo/db/concurrency/lock_state.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/commit_notifier.h" -#include "mongo/db/storage/mmap_v1/dur_commitjob.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/dur_journal_writer.h" -#include "mongo/db/storage/mmap_v1/dur_recover.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/stdx/condition_variable.h" -#include "mongo/stdx/mutex.h" -#include "mongo/stdx/thread.h" -#include "mongo/util/clock_source.h" -#include "mongo/util/exit.h" -#include "mongo/util/log.h" -#include "mongo/util/timer.h" - -namespace mongo { - -using std::endl; -using std::fixed; -using std::hex; -using std::set; -using std::setprecision; -using std::setw; -using std::string; -using std::stringstream; - -namespace dur { - -namespace { - -// Used to activate the flush thread -stdx::mutex flushMutex; -stdx::condition_variable flushRequested; - -// This is waited on for getlasterror acknowledgements. It means that data has been written to -// the journal, but not necessarily applied to the shared view, so it is all right to -// acknowledge the user operation, but NOT all right to delete the journal files for example. -CommitNotifier commitNotify; - -// This is waited on for complete flush. It means that data has been both written to journal -// and applied to the shared view, so it is allowed to delete the journal files. Used for -// fsync:true, close DB, shutdown acknowledgements. -CommitNotifier applyToDataFilesNotify; - -// When set, the flush thread will exit -AtomicUInt32 shutdownRequested(0); - -enum { - // How many commit cycles to do before considering doing a remap - NumCommitsBeforeRemap = 10, - - // How many outstanding journal flushes should be allowed before applying writer back - // pressure. Size of 1 allows two journal blocks to be in the process of being written - - // one on the journal writer's buffer and one blocked waiting to be picked up. - NumAsyncJournalWrites = 1, -}; - -// Remap loop state -unsigned remapFileToStartAt; - -// How frequently to reset the durability statistics -enum { DurStatsResetIntervalMillis = 3 * 1000 }; - -// Size sanity checks -MONGO_STATIC_ASSERT(UncommittedBytesLimit > BSONObjMaxInternalSize * 3); -MONGO_STATIC_ASSERT(sizeof(void*) == 4 || UncommittedBytesLimit > BSONObjMaxInternalSize * 6); - - -/** - * MMAP V1 durability server status section. 
- */ -class DurSSS : public ServerStatusSection { -public: - DurSSS() : ServerStatusSection("dur") {} - - virtual bool includeByDefault() const { - return true; - } - - virtual BSONObj generateSection(OperationContext* opCtx, - const BSONElement& configElement) const { - if (!getDur().isDurable()) { - return BSONObj(); - } - - return dur::stats.asObj(); - } - -} durSSS; - - -/** - * A no-op durability interface. Used for the case when journaling is not enabled. - */ -class NonDurableImpl : public DurableInterface { -public: - NonDurableImpl() {} - - // DurableInterface virtual methods - virtual void* writingPtr(void* x, unsigned len) { - return x; - } - virtual void declareWriteIntent(void*, unsigned) {} - virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) {} - virtual void createdFile(const std::string& filename, unsigned long long len) {} - virtual bool waitUntilDurable() { - return false; - } - virtual bool commitNow(OperationContext* opCtx) { - return false; - } - virtual bool commitIfNeeded() { - return false; - } - virtual void syncDataAndTruncateJournal(OperationContext* opCtx) {} - virtual bool isDurable() const { - return false; - } - virtual void closingFileNotification() {} - virtual void commitAndStopDurThread(OperationContext* opCtx) {} -}; - - -/** - * The actual durability interface, when journaling is enabled. - */ -class DurableImpl : public DurableInterface { -public: - DurableImpl() {} - - // DurableInterface virtual methods - virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents); - virtual void createdFile(const std::string& filename, unsigned long long len); - virtual bool waitUntilDurable(); - virtual bool commitNow(OperationContext* opCtx); - virtual bool commitIfNeeded(); - virtual void syncDataAndTruncateJournal(OperationContext* opCtx); - virtual bool isDurable() const { - return true; - } - virtual void closingFileNotification(); - virtual void commitAndStopDurThread(OperationContext* opCtx); - - void start(ClockSource* cs, int64_t serverStartMs); - -private: - stdx::thread _durThreadHandle; -}; - - -/** - * Diagnostic to check that the private view and the non-private view are in sync after - * applying the journal changes. This function is very slow and only runs when paranoid checks - * are enabled. - * - * Must be called under at least S flush lock to ensure that there are no concurrent writes - * happening. 
- */
-void debugValidateFileMapsMatch(const DurableMappedFile* mmf) {
-    const unsigned char* p = (const unsigned char*)mmf->getView();
-    const unsigned char* w = (const unsigned char*)mmf->view_write();
-
-    // Ignore pre-allocated files that are not fully created yet
-    if (!p || !w) {
-        return;
-    }
-
-    if (memcmp(p, w, (unsigned)mmf->length()) == 0) {
-        return;
-    }
-
-    unsigned low = 0xffffffff;
-    unsigned high = 0;
-
-    log() << "DurParanoid mismatch in " << mmf->filename();
-
-    int logged = 0;
-    unsigned lastMismatch = 0xffffffff;
-
-    for (unsigned i = 0; i < mmf->length(); i++) {
-        if (p[i] != w[i]) {
-            if (lastMismatch != 0xffffffff && lastMismatch + 1 != i) {
-                // Separate blocks of mismatches
-                log() << std::endl;
-            }
-
-            lastMismatch = i;
-
-            if (++logged < 60) {
-                if (logged == 1) {
-                    // For .ns files to find offset in record
-                    log() << "ofs % 628 = 0x" << hex << (i % 628) << endl;
-                }
-
-                stringstream ss;
-                ss << "mismatch ofs:" << hex << i << "\tfilemap:" << setw(2) << (unsigned)w[i]
-                   << "\tprivmap:" << setw(2) << (unsigned)p[i];
-
-                if (p[i] > 32 && p[i] <= 126) {
-                    ss << '\t' << p[i];
-                }
-
-                log() << ss.str() << endl;
-            }
-
-            if (logged == 60) {
-                log() << "..." << endl;
-            }
-
-            if (i < low)
-                low = i;
-            if (i > high)
-                high = i;
-        }
-    }
-
-    if (low != 0xffffffff) {
-        std::stringstream ss;
-        ss << "journal error warning views mismatch " << mmf->filename() << ' ' << hex << low
-           << ".." << high << " len:" << high - low + 1;
-
-        log() << ss.str() << endl;
-        log() << "priv loc: " << (void*)(p + low) << ' ' << endl;
-
-        severe() << "Written data does not match in-memory view. Missing WriteIntent?";
-        MONGO_UNREACHABLE;
-    }
-}
-
-
-/**
- * Main code of the remap private view function.
- */
-void remapPrivateViewImpl(OperationContext* opCtx, double fraction) {
-    LOG(4) << "journal REMAPPRIVATEVIEW" << endl;
-
-// There is no way that the set of files can change while we are in this method, because
-// we hold the flush lock in X mode. For files to go away, a database needs to be dropped,
-// which means acquiring the flush lock in at least IX mode.
-//
-// However, the record fetcher logic unfortunately operates without any locks, and on
-// Windows and Solaris remap is not atomic, so there is a window where the record fetcher
-// might get an access violation. That's why we acquire the mongo files mutex here in X
-// mode and the record fetcher takes it in S-mode (see MmapV1RecordFetcher for more
-// detail).
-//
-// See SERVER-5723 for performance improvement.
-// See SERVER-5680 to see why this code is necessary on Windows.
-// See SERVER-8795 to see why this code is necessary on Solaris.
-#if defined(_WIN32) || defined(__sun) - LockMongoFilesExclusive lk(opCtx); -#else - LockMongoFilesShared lk(opCtx); -#endif - - std::set<MongoFile*>& files = MongoFile::getAllFiles(); - - const unsigned sz = files.size(); - if (sz == 0) { - return; - } - - unsigned ntodo = (unsigned)(sz * fraction); - if (ntodo < 1) - ntodo = 1; - if (ntodo > sz) - ntodo = sz; - - const set<MongoFile*>::iterator b = files.begin(); - const set<MongoFile*>::iterator e = files.end(); - set<MongoFile*>::iterator i = b; - - // Skip to our starting position as remembered from the last remap cycle - for (unsigned x = 0; x < remapFileToStartAt; x++) { - i++; - if (i == e) - i = b; - } - - // Mark where to start on the next cycle - const unsigned startedAt = remapFileToStartAt; - remapFileToStartAt = (remapFileToStartAt + ntodo) % sz; - - Timer t; - - for (unsigned x = 0; x < ntodo; x++) { - if ((*i)->isDurableMappedFile()) { - DurableMappedFile* const mmf = (DurableMappedFile*)*i; - - // Sanity check that the contents of the shared and the private view match so we - // don't end up overwriting data. - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalParanoid) { - debugValidateFileMapsMatch(mmf); - } - - if (mmf->willNeedRemap()) { - mmf->remapThePrivateView(opCtx); - } - - i++; - - if (i == e) - i = b; - } - } - - LOG(3) << "journal REMAPPRIVATEVIEW done startedAt: " << startedAt << " n:" << ntodo << ' ' - << t.millis() << "ms"; -} - - -// One instance of each durability interface -DurableImpl durableImpl; -NonDurableImpl nonDurableImpl; - -// Notified when we commit to the journal. -static JournalListener* journalListener = &NoOpJournalListener::instance; -// Protects journalListener. -static stdx::mutex journalListenerMutex; - -} // namespace - - -// Declared in dur_preplogbuffer.cpp -void PREPLOGBUFFER(JSectHeader& outHeader, - AlignedBuilder& outBuffer, - ClockSource* cs, - int64_t serverStartMs); - -// Declared in dur_journal.cpp -boost::filesystem::path getJournalDir(); -void preallocateFiles(); - -// Forward declaration -static void durThread(ClockSource* cs, int64_t serverStartMs); - -// Durability activity statistics -Stats stats; - -// Reference to the write intents tracking object -CommitJob commitJob; - -// Reference to the active durability interface -DurableInterface* DurableInterface::_impl(&nonDurableImpl); - - -// -// Stats -// - -Stats::Stats() : _currIdx(0) {} - -void Stats::reset() { - // Seal the current metrics - _stats[_currIdx]._durationMillis = _stats[_currIdx].getCurrentDurationMillis(); - - // Use a new metric - const unsigned newCurrIdx = (_currIdx + 1) % (sizeof(_stats) / sizeof(_stats[0])); - _stats[newCurrIdx].reset(); - - _currIdx = newCurrIdx; -} - -BSONObj Stats::asObj() const { - // Use the previous statistic - const S& stats = _stats[(_currIdx - 1) % (sizeof(_stats) / sizeof(_stats[0]))]; - - BSONObjBuilder builder; - stats._asObj(&builder); - - return builder.obj(); -} - -void Stats::S::reset() { - memset(this, 0, sizeof(*this)); - _startTimeMicros = curTimeMicros64(); -} - -std::string Stats::S::_CSVHeader() const { - return "cmts\t jrnMB\t wrDFMB\t cIWLk\t early\t prpLgB\t wrToJ\t wrToDF\t rmpPrVw"; -} - -std::string Stats::S::_asCSV() const { - stringstream ss; - ss << setprecision(2) << _commits << '\t' << _journaledBytes / 1000000.0 << '\t' - << _writeToDataFilesBytes / 1000000.0 << '\t' << _commitsInWriteLock << '\t' << 0 << '\t' - << (unsigned)(_prepLogBufferMicros / 1000) << '\t' - << (unsigned)(_writeToJournalMicros / 1000) << '\t' - << 
(unsigned)(_writeToDataFilesMicros / 1000) << '\t' - << (unsigned)(_remapPrivateViewMicros / 1000) << '\t' << (unsigned)(_commitsMicros / 1000) - << '\t' << (unsigned)(_commitsInWriteLockMicros / 1000) << '\t'; - - return ss.str(); -} - -void Stats::S::_asObj(BSONObjBuilder* builder) const { - BSONObjBuilder& b = *builder; - b << "commits" << _commits << "journaledMB" << _journaledBytes / 1000000.0 - << "writeToDataFilesMB" << _writeToDataFilesBytes / 1000000.0 << "compression" - << _journaledBytes / (_uncompressedBytes + 1.0) << "commitsInWriteLock" << _commitsInWriteLock - << "earlyCommits" << 0 << "timeMs" - << BSON("dt" << _durationMillis << "prepLogBuffer" << (unsigned)(_prepLogBufferMicros / 1000) - << "writeToJournal" - << (unsigned)(_writeToJournalMicros / 1000) - << "writeToDataFiles" - << (unsigned)(_writeToDataFilesMicros / 1000) - << "remapPrivateView" - << (unsigned)(_remapPrivateViewMicros / 1000) - << "commits" - << (unsigned)(_commitsMicros / 1000) - << "commitsInWriteLock" - << (unsigned)(_commitsInWriteLockMicros / 1000)); - - if (storageGlobalParams.journalCommitIntervalMs.load() != 0) { - b << "journalCommitIntervalMs" << storageGlobalParams.journalCommitIntervalMs.load(); - } -} - - -// -// DurableInterface -// - -DurableInterface::DurableInterface() {} - -DurableInterface::~DurableInterface() {} - - -// -// DurableImpl -// - -bool DurableImpl::commitNow(OperationContext* opCtx) { - CommitNotifier::When when = commitNotify.now(); - - AutoYieldFlushLockForMMAPV1Commit flushLockYield(opCtx->lockState()); - - // There is always just one waiting anyways - flushRequested.notify_one(); - - // commitNotify.waitFor ensures that whatever was scheduled for journaling before this - // call has been persisted to the journal file. This does not mean that this data has been - // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify. - applyToDataFilesNotify.waitFor(when); - - return true; -} - -bool DurableImpl::waitUntilDurable() { - commitNotify.awaitBeyondNow(); - return true; -} - -void DurableImpl::createdFile(const std::string& filename, unsigned long long len) { - std::shared_ptr<DurOp> op(new FileCreatedOp(filename, len)); - commitJob.noteOp(op); -} - - -void DurableImpl::declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) { - typedef std::vector<std::pair<void*, unsigned>> Intents; - stdx::lock_guard<SimpleMutex> lk(commitJob.groupCommitMutex); - for (Intents::const_iterator it(intents.begin()), end(intents.end()); it != end; ++it) { - commitJob.note(it->first, it->second); - } -} - -bool DurableImpl::commitIfNeeded() { - if (MONGO_likely(commitJob.bytes() < UncommittedBytesLimit)) { - return false; - } - - // Just wake up the flush thread - flushRequested.notify_one(); - return true; -} - -void DurableImpl::syncDataAndTruncateJournal(OperationContext* opCtx) { - invariant(opCtx->lockState()->isW()); - - // Once this returns, all the outstanding journal has been applied to the data files and - // so it's safe to do the flushAll/journalCleanup below. - commitNow(opCtx); - - // Flush the shared view to disk. - MongoFile::flushAll(opCtx, true); - - // Once the shared view has been flushed, we do not need the journal files anymore. 
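
For context on the sequence implemented here, a caller-side sketch of the barrier this method provides, grounded in the dur.h doc comment later in this diff (the wrapper name is hypothetical, and the caller must hold the global X lock and be outside any WriteUnitOfWork):

    void journalBarrier(OperationContext* opCtx) {
        invariant(opCtx->lockState()->isW());
        getDur().syncDataAndTruncateJournal(opCtx);
        // After this returns: the shared view is durable on disk and the journal is
        // deleted, so recovery can never replay pre-barrier writes over newer files.
    }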
- journalCleanup(true); - - // Double check post-conditions - invariant(!haveJournalFiles()); -} - -void DurableImpl::closingFileNotification() { - // File is closing while there are unwritten changes - invariant(!commitJob.hasWritten(), - "journal warning files are closing outside locks with writes pending"); -} - -void DurableImpl::commitAndStopDurThread(OperationContext* opCtx) { - CommitNotifier::When when = commitNotify.now(); - - // There is always just one waiting anyways - flushRequested.notify_one(); - - // commitNotify.waitFor ensures that whatever was scheduled for journaling before this - // call has been persisted to the journal file. This does not mean that this data has been - // applied to the shared view yet though, that's why we wait for applyToDataFilesNotify. - applyToDataFilesNotify.waitFor(when); - - // Flush the shared view to disk. - MongoFile::flushAll(opCtx, true); - - // Once the shared view has been flushed, we do not need the journal files anymore. - journalCleanup(true); - - // Double check post-conditions - invariant(!haveJournalFiles()); - - shutdownRequested.store(1); - - // Wait for the durability thread to terminate - log() << "Terminating durability thread ..."; - _durThreadHandle.join(); -} - -void DurableImpl::start(ClockSource* cs, int64_t serverStartMs) { - // Start the durability thread - stdx::thread t(durThread, cs, serverStartMs); - _durThreadHandle.swap(t); -} - - -/** - * Remaps the private view from the shared view so that it does not consume too much - * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed - * to disk and applied on top of the shared view. - * - * @param fraction Value between (0, 1] indicating what fraction of the memory to remap. - * Remapping too much or too frequently incurs copy-on-write page fault cost. - */ -static void remapPrivateView(OperationContext* opCtx, double fraction) { - // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any - // newly written data on reads. - invariant(!commitJob.hasWritten()); - - try { - Timer t; - remapPrivateViewImpl(opCtx, fraction); - stats.curr()->_remapPrivateViewMicros += t.micros(); - - LOG(4) << "remapPrivateView end"; - return; - } catch (DBException& e) { - severe() << "dbexception in remapPrivateView causing immediate shutdown: " << redact(e); - } catch (std::ios_base::failure& e) { - severe() << "ios_base exception in remapPrivateView causing immediate shutdown: " - << redact(e.what()); - } catch (std::bad_alloc& e) { - severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: " - << redact(e.what()); - } catch (std::exception& e) { - severe() << "exception in remapPrivateView causing immediate shutdown: " - << redact(e.what()); - } catch (...) { - severe() << "unknown exception in remapPrivateView causing immediate shutdown: "; - } - - MONGO_UNREACHABLE; -} - - -/** - * The main durability thread loop. There is a single instance of this function running. - */ -static void durThread(ClockSource* cs, int64_t serverStartMs) { - Client::initThread("durability"); - - log() << "Durability thread started"; - - bool samePartition = true; - try { - const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string(); - samePartition = onSamePartition(getJournalDir().string(), dbpathDir); - } catch (...) 
{
-    }
-
-    // Spawn the journal writer thread
-    JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
-    journalWriter.start();
-
-    // Used as an estimate of how much / how fast to remap
-    uint64_t commitCounter(0);
-    uint64_t estimatedPrivateMapSize(0);
-    uint64_t remapLastTimestamp(0);
-
-    while (shutdownRequested.loadRelaxed() == 0) {
-        unsigned ms = storageGlobalParams.journalCommitIntervalMs.load();
-        if (ms == 0) {
-            ms = samePartition ? 100 : 30;
-        }
-
-        // +1 so it never goes down to zero
-        const int64_t oneThird = (ms / 3) + 1;
-
-        // Reset the stats based on the reset interval
-        if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
-            stats.reset();
-        }
-
-        try {
-            stdx::unique_lock<stdx::mutex> lock(flushMutex);
-
-            for (unsigned i = 0; i <= 2; i++) {
-                if (stdx::cv_status::no_timeout ==
-                    flushRequested.wait_for(lock, Milliseconds(oneThird).toSystemDuration())) {
-                    // Someone forced a flush
-                    break;
-                }
-
-                if (commitNotify.nWaiting()) {
-                    // One or more getLastError j:true requests are pending
-                    break;
-                }
-
-                if (commitJob.bytes() > UncommittedBytesLimit / 2) {
-                    // The number of written bytes is growing
-                    break;
-                }
-            }
-
-            // The commit logic itself
-            LOG(4) << "groupCommit begin";
-
-            Timer t;
-
-            const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext();
-            OperationContext& opCtx = *opCtxPtr;
-            AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(opCtx.lockState());
-
-            // We need to snapshot the commitNumber after the flush lock has been obtained,
-            // because at this point we know that we have a stable snapshot of the data.
-            const CommitNotifier::When commitNumber(commitNotify.now());
-
-            LOG(4) << "Processing commit number " << commitNumber;
-
-            if (!commitJob.hasWritten()) {
-                // We do not need the journal lock anymore. Free it here, for the really
-                // unlikely possibility that the writeBuffer command below blocks.
-                autoFlushLock.release();
-
-                // A getlasterror request could have come after the data was already committed.
-                // No need to call committingReset though, because we have not done any
-                // writes (hasWritten == false).
-                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
-                buffer->setNoop();
-                buffer->journalListenerToken = getJournalListener()->getToken();
-
-                journalWriter.writeBuffer(buffer, commitNumber);
-            } else {
-                // This copies all the in-memory changes into the journal writer's buffer.
-                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
-                PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder(), cs, serverStartMs);
-
-                estimatedPrivateMapSize += commitJob.bytes();
-                commitCounter++;
-
-                // Now that the write intents have been copied to the buffer, the commit job is
-                // free to be reused. We need to reset the commit job's contents while under
-                // the S flush lock, because otherwise someone might have done a write and this
-                // would wipe out their changes without ever being committed.
-                commitJob.committingReset();
-
-                double systemMemoryPressurePercentage =
-                    ProcessInfo::getSystemMemoryPressurePercentage();
-
-                // Now that the in-memory modifications have been collected, we can potentially
-                // release the flush lock if remap is not necessary.
-                // When we remap due to memory pressure, we look at two criteria
-                // 1. If the amount of 4k pages touched exceeds 512 MB,
-                //    a reasonable estimate of memory pressure on Linux.
-                // 2. Check if the amount of free memory on the machine is running low,
-                //    since #1 underestimates the memory pressure on Windows, where
-                //    commits happen in 64MB chunks.
-                const bool shouldRemap = (estimatedPrivateMapSize >= UncommittedBytesLimit) ||
-                    (systemMemoryPressurePercentage > 0.0) ||
-                    (commitCounter % NumCommitsBeforeRemap == 0) ||
-                    (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);
-
-                double remapFraction = 0.0;
-
-                if (shouldRemap) {
-                    // We want to remap all private views about every 2 seconds. There could be
-                    // ~1000 views so we do a little each pass. There will be copy on write
-                    // faults after remapping, so doing a little bit at a time will avoid big
-                    // load spikes when the pages are touched.
-                    //
-                    // TODO: Instead of the time-based logic above, consider using ProcessInfo
-                    // and watching for getResidentSize to drop, which is more precise.
-                    remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;
-
-                    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap) {
-                        remapFraction = 1;
-                    } else {
-                        // We don't want to get close to the UncommittedBytesLimit
-                        const double remapMemFraction =
-                            estimatedPrivateMapSize / ((double)UncommittedBytesLimit);
-
-                        remapFraction = std::max(remapMemFraction, remapFraction);
-
-                        remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
-                    }
-                } else {
-                    LOG(4) << "Early release flush lock";
-
-                    // We will not be doing a remap so drop the flush lock. That way we will be
-                    // doing the journal I/O outside of lock, so other threads can proceed.
-                    invariant(!shouldRemap);
-                    autoFlushLock.release();
-                }
-
-                buffer->journalListenerToken = getJournalListener()->getToken();
-                // Request async I/O to the journal. This may block.
-                journalWriter.writeBuffer(buffer, commitNumber);
-
-                // Data has now been written to the shared view. If remap was requested, we
-                // would still be holding the S flush lock here, so just upgrade it and
-                // perform the remap.
-                if (shouldRemap) {
-                    // Need to wait for the previously scheduled journal writes to complete
-                    // before any remap is attempted.
-                    journalWriter.flush();
-                    journalWriter.assertIdle();
-
-                    // Upgrading the journal lock to flush stops all activity on the system,
-                    // because we will be remapping memory and we don't want readers to be
-                    // accessing it. Technically this step could be avoided on systems which
-                    // support atomic remap.
-                    autoFlushLock.upgradeFlushLockToExclusive();
-                    remapPrivateView(opCtxPtr.get(), remapFraction);
-
-                    autoFlushLock.release();
-
-                    // Reset the private map estimate outside of the lock
-                    estimatedPrivateMapSize = 0;
-                    remapLastTimestamp = curTimeMicros64();
-
-                    stats.curr()->_commitsInWriteLock++;
-                    stats.curr()->_commitsInWriteLockMicros += t.micros();
-                }
-            }
-
-            stats.curr()->_commits++;
-            stats.curr()->_commitsMicros += t.micros();
-
-            LOG(4) << "groupCommit end";
-        } catch (DBException& e) {
-            severe() << "dbexception in durThread causing immediate shutdown: " << redact(e);
-            MONGO_UNREACHABLE;
-        } catch (std::ios_base::failure& e) {
-            severe() << "ios_base exception in durThread causing immediate shutdown: "
-                     << redact(e.what());
-            MONGO_UNREACHABLE;
-        } catch (std::bad_alloc& e) {
-            severe() << "bad_alloc exception in durThread causing immediate shutdown: "
-                     << redact(e.what());
-            MONGO_UNREACHABLE;
-        } catch (std::exception& e) {
-            severe() << "exception in durThread causing immediate shutdown: " << redact(e.what());
-            MONGO_UNREACHABLE;
-        } catch (...)
{ - severe() << "unhandled exception in durThread causing immediate shutdown"; - MONGO_UNREACHABLE; - } - } - - // Stops the journal thread and ensures everything was written - invariant(!commitJob.hasWritten()); - - journalWriter.flush(); - journalWriter.shutdown(); - - log() << "Durability thread stopped"; -} - - -/** - * Invoked at server startup. Recovers the database by replaying journal files and then - * starts the durability thread. - */ -void startup(ClockSource* cs, int64_t serverStartMs) { - if (!storageGlobalParams.dur) { - return; - } - - journalMakeDir(cs, serverStartMs); - - try { - replayJournalFilesAtStartup(); - } catch (DBException& e) { - severe() << "dbexception during recovery: " << redact(e); - throw; - } catch (std::exception& e) { - severe() << "std::exception during recovery: " << redact(e.what()); - throw; - } catch (...) { - severe() << "exception during recovery"; - throw; - } - - preallocateFiles(); - - durableImpl.start(cs, serverStartMs); - DurableInterface::_impl = &durableImpl; -} - -void setJournalListener(JournalListener* jl) { - stdx::unique_lock<stdx::mutex> lk(journalListenerMutex); - journalListener = jl; -} - -JournalListener* getJournalListener() { - stdx::unique_lock<stdx::mutex> lk(journalListenerMutex); - return journalListener; -} - -} // namespace dur -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur.h b/src/mongo/db/storage/mmap_v1/dur.h deleted file mode 100644 index 06b38255c25..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur.h +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <string> -#include <vector> - -#include "mongo/base/disallow_copying.h" -#include "mongo/db/storage/journal_listener.h" - -namespace mongo { - -class ClockSource; -class OperationContext; - -namespace dur { - -// a smaller limit is likely better on 32 bit -const unsigned UncommittedBytesLimit = (sizeof(void*) == 4) ? 
50 * 1024 * 1024 : 512 * 1024 * 1024;
-
-class DurableInterface {
-    MONGO_DISALLOW_COPYING(DurableInterface);
-
-public:
-    virtual ~DurableInterface();
-
-    /**
-     * Declare that a file has been created. Normally writes are applied only after journaling
-     * for safety. But here the file is created first, and the journal will just replay the
-     * creation if the create didn't happen due to a crash.
-     */
-    virtual void createdFile(const std::string& filename, unsigned long long len) = 0;
-
-    // Declare write intents. Use these methods to declare "i'm about to write to x and it
-    // should be logged for redo."
-    //
-    // Failure to declare write intents is checked in MONGO_CONFIG_DEBUG_BUILD mode by
-    // using a read only mapped view (i.e., you'll segfault if the code is covered in that
-    // situation). The debug check doesn't verify that your length is correct though.
-    virtual void declareWriteIntents(const std::vector<std::pair<void*, unsigned>>& intents) = 0;
-
-    /** Wait for acknowledgement of the next group commit.
-        @return true if --dur is on. There will be delay.
-        @return false if --dur is off.
-    */
-    virtual bool waitUntilDurable() = 0;
-
-    /** Commit immediately.
-
-        Generally, you do not want to do this often, as highly granular committing may affect
-        performance.
-
-        Does not return until the commit is complete.
-
-        You must be at least read locked when you call this. Ideally, you are not write locked
-        and then read operations can occur concurrently.
-
-        Do not use this. Use commitIfNeeded() instead.
-
-        @return true if --dur is on.
-        @return false if --dur is off. (in which case there is no action)
-    */
-    virtual bool commitNow(OperationContext* opCtx) = 0;
-
-    /** Commit if enough bytes have been modified. Current threshold is 50MB
-
-        The idea is that long running write operations that don't yield
-        (like creating an index) can call this whenever the db is in a sane state and it will
-        prevent commits from growing too large.
-        @return true if committed
-    */
-    virtual bool commitIfNeeded() = 0;
-
-
-    /**
-     * Called when a DurableMappedFile is closing. Asserts that there are no unwritten changes,
-     * because that would mean journal replay on recovery would try to write to non-existent
-     * files and fail.
-     */
-    virtual void closingFileNotification() = 0;
-
-    /**
-     * Invoked at clean shutdown time. Performs one last commit/flush and terminates the
-     * flush thread.
-     *
-     * Must be called under the global X lock.
-     */
-    virtual void commitAndStopDurThread(OperationContext* opCtx) = 0;
-
-    /**
-     * Commits pending changes, flushes all changes to main data files, then removes the
-     * journal.
-     *
-     * WARNING: Data *must* be in a crash-recoverable state when this is called and must
-     * not be inside of a write unit of work.
-     *
-     * This is useful as a "barrier" to ensure that writes before this call will never go
-     * through recovery and be applied to files that have had changes made after this call
-     * applied.
-     */
-    virtual void syncDataAndTruncateJournal(OperationContext* opCtx) = 0;
-
-    virtual bool isDurable() const = 0;
-
-    static DurableInterface& getDur() {
-        return *_impl;
-    }
-
-protected:
-    DurableInterface();
-
-private:
-    friend void startup(ClockSource* cs, int64_t serverStartMs);
-
-    static DurableInterface* _impl;
-};
-
-
-/**
- * Called during startup to start the durability module.
- * Does nothing if storageGlobalParams.dur is false - */ -void startup(ClockSource* cs, int64_t serverStartMs); - -// Sets a new JournalListener, which is used to alert the rest of the system about -// journaled write progress. -void setJournalListener(JournalListener* jl); - -// Expose the JournalListener, needed for the journal writer thread. -JournalListener* getJournalListener(); - -} // namespace dur - - -/** - * Provides a reference to the active durability interface. - * - * TODO: The only reason this is an inline function is that tests try to link it and fail if - * the MMAP V1 engine is not included. - */ -inline dur::DurableInterface& getDur() { - return dur::DurableInterface::getDur(); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp b/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp deleted file mode 100644 index 6a8ca62f15d..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_commitjob.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* @file dur_commitjob.cpp */ - -/** -* Copyright (C) 2009 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
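[Annotation] The DurableInterface contract removed above is easiest to read from the caller's side. A minimal sketch of the declare-before-write pattern, compiling against the declarations in dur.h above; updateMappedRecord and its arguments are hypothetical stand-ins, not code from the removed engine:

#include <cstring>
#include <utility>
#include <vector>

// Hypothetical caller showing the declare-before-write contract of
// DurableInterface (getDur(), declareWriteIntents(), commitIfNeeded()
// are the real declarations above; everything else is illustrative).
void updateMappedRecord(char* record, const char* src, unsigned len) {
    // Declare the intent first, so the journal can redo this write.
    std::vector<std::pair<void*, unsigned>> intents;
    intents.emplace_back(record, len);
    mongo::getDur().declareWriteIntents(intents);

    // Only then touch the private mapped view.
    std::memcpy(record, src, len);

    // Long-running writers commit opportunistically so uncommitted
    // bytes stay below UncommittedBytesLimit.
    mongo::getDur().commitIfNeeded();
}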
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur_commitjob.h" - -#include <iostream> - -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/util/log.h" -#include "mongo/util/stacktrace.h" - -namespace mongo { - -using std::shared_ptr; -using std::endl; -using std::max; -using std::min; - -namespace dur { - -void WriteIntent::absorb(const WriteIntent& other) { - dassert(overlaps(other)); - - void* newStart = min(start(), other.start()); - p = max(p, other.p); - len = (char*)p - (char*)newStart; - - dassert(contains(other)); -} - - -CommitJob::CommitJob() : _hasWritten(false), _lastNotedPos(0), _bytes(0) {} - -CommitJob::~CommitJob() {} - -void CommitJob::noteOp(shared_ptr<DurOp> p) { - stdx::lock_guard<SimpleMutex> lk(groupCommitMutex); - _hasWritten = true; - _durOps.push_back(p); -} - -void CommitJob::note(void* p, int len) { - _hasWritten = true; - - if (!_alreadyNoted.checkAndSet(p, len)) { - // Remember intent. We will journal it in a bit. - _insertWriteIntent(p, len); - - // Round off to page address (4KB). - const size_t x = ((size_t)p) & ~0xfff; - - if (x != _lastNotedPos) { - _lastNotedPos = x; - - // Add the full page amount - _bytes += (len + 4095) & ~0xfff; - - if (_bytes > UncommittedBytesLimit * 3) { - _complains++; - - // Throttle logging - if (_complains < 100 || (curTimeMillis64() - _lastComplainMs >= 60000)) { - _lastComplainMs = curTimeMillis64(); - - warning() << "DR102 too much data written uncommitted (" << _bytes / 1000000.0 - << "MB)"; - - if (_complains < 10 || _complains % 10 == 0) { - printStackTrace(); - } - } - } - } - } -} - -void CommitJob::committingReset() { - _hasWritten = false; - _alreadyNoted.clear(); - _intents.clear(); - _durOps.clear(); - _bytes = 0; -} - -} // namespace "dur" -} // namespace "mongo" diff --git a/src/mongo/db/storage/mmap_v1/dur_commitjob.h b/src/mongo/db/storage/mmap_v1/dur_commitjob.h deleted file mode 100644 index 80d6cf900f5..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_commitjob.h +++ /dev/null @@ -1,224 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. 
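[Annotation] The page-rounding arithmetic in CommitJob::note() above is worth a standalone illustration. A self-contained sketch with hypothetical values, assuming the same 4 KB pages as the removed code:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Round an address down to its 4 KB page and a length up to whole
// pages, mirroring the bit tricks in CommitJob::note() above.
int main() {
    const std::uintptr_t addr = 0x12345678;
    const unsigned len = 100;

    const std::uintptr_t pageStart = addr & ~std::uintptr_t(0xfff);
    const unsigned charged = (len + 4095) & ~0xfffu;  // 4096 for len=100

    std::printf("page start: 0x%zx, bytes charged: %u\n",
                static_cast<std::size_t>(pageStart), charged);
    return 0;
}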
If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - - -#include "mongo/db/storage/mmap_v1/durop.h" -#include "mongo/util/concurrency/mutex.h" - -namespace mongo { -namespace dur { - -typedef std::vector<std::shared_ptr<DurOp>> DurOpsVector; - -/** - * Declaration of an intent to write to a region of a memory mapped view. We store the end - * rather than the start pointer to make operator < faster since that is heavily used in - * set lookup. - */ -struct WriteIntent { - WriteIntent() : p(0) {} - WriteIntent(void* a, unsigned b) : p((char*)a + b), len(b) {} - - void* start() const { - return (char*)p - len; - } - void* end() const { - return p; - } - unsigned length() const { - return len; - } - bool operator<(const WriteIntent& rhs) const { - return end() < rhs.end(); - } - - bool overlaps(const WriteIntent& rhs) const { - return (start() <= rhs.end() && end() >= rhs.start()); - } - - bool contains(const WriteIntent& rhs) const { - return (start() <= rhs.start() && end() >= rhs.end()); - } - - // merge into me: - void absorb(const WriteIntent& other); - - friend std::ostream& operator<<(std::ostream& out, const WriteIntent& wi) { - return (out << "p: " << wi.p << " end: " << wi.end() << " len: " << wi.len); - } - -private: - void* p; // intent to write up to p - unsigned len; // up to this len -}; - -typedef std::vector<WriteIntent> WriteIntentsVector; - - -/** - * Bitmap to remember things we have already marked for journaling. False negatives are ok - * if infrequent, since they impact performance. - */ -template <int Prime> -class Already { - MONGO_DISALLOW_COPYING(Already); - -public: - Already() { - clear(); - } - - void clear() { - memset(this, 0, sizeof(*this)); - } - - /** - * Checks if we have Already recorded/indicated our write intent for this region of - * memory and automatically upgrades the length if the length was shorter previously. - * - * @return true if already indicated. - */ - bool checkAndSet(void* p, int len) { - const unsigned x = hashPointer(p); - std::pair<void*, int>& nd = nodes[x % Prime]; - - if (nd.first == p) { - if (nd.second < len) { - nd.second = len; - return false; // haven't indicated this len yet - } - return true; // already indicated - } - - nd.first = p; - nd.second = len; - return false; // a new set - } - -private: - static unsigned hashPointer(void* v) { - unsigned x = 0; - unsigned char* p = (unsigned char*)&v; - for (unsigned i = 0; i < sizeof(void*); i++) { - x = x * 131 + p[i]; - } - return x; - } - - std::pair<void*, int> nodes[Prime]; -}; - - -/** - * Tracks all write operations on the private view so they can be journaled. - */ -class CommitJob { - MONGO_DISALLOW_COPYING(CommitJob); - -public: - CommitJob(); - ~CommitJob(); - - /** - * Note an operation other than a "basic write". - */ - void noteOp(std::shared_ptr<DurOp> p); - - /** - * Record/note an intent to write. - * - * NOTE: Not thread safe. Requires the mutex to be locked. - */ - void note(void* p, int len); - - /** - * When this value is false we don't have to do any group commit. - */ - bool hasWritten() const { - return _hasWritten; - } - - /** - * We use the commitjob object over and over, calling committingReset() rather than - * reconstructing. - */ - void committingReset(); - - /** - * We check how much written and if it is getting to be a lot, we commit sooner. 
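[Annotation] The Already<Prime> class above is a direct-mapped dedup cache whose correctness argument is subtle: a hash collision merely evicts the old entry, producing a false negative (a region gets journaled twice), never a false positive. A behavioral restatement as a standalone sketch, not the removed class itself:

#include <cstring>
#include <utility>

// Direct-mapped "seen it already" cache in the spirit of Already<Prime>:
// collisions evict, so a false negative costs only performance.
template <int Prime>
class SeenCache {
public:
    SeenCache() { std::memset(nodes, 0, sizeof(nodes)); }

    bool checkAndSet(void* p, int len) {
        std::pair<void*, int>& nd = nodes[hashPointer(p) % Prime];
        if (nd.first == p && nd.second >= len)
            return true;  // region (and length) already recorded
        nd = {p, len};    // record new entry, or upgrade the length
        return false;
    }

private:
    static unsigned hashPointer(void* v) {
        unsigned x = 0;
        auto* b = reinterpret_cast<unsigned char*>(&v);
        for (unsigned i = 0; i < sizeof(void*); i++)
            x = x * 131 + b[i];
        return x;
    }
    std::pair<void*, int> nodes[Prime];
};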
- */ - size_t bytes() const { - return _bytes; - } - - /** - * Sorts the internal list of write intents so that overlapping and duplicate items can be - * merged. We do the sort here so the caller receives something they must keep const from - * their POV. - */ - const WriteIntentsVector& getIntentsSorted() { - sort(_intents.begin(), _intents.end()); - return _intents; - } - - const DurOpsVector& ops() const { - return _durOps; - } - - SimpleMutex groupCommitMutex; - -private: - void _insertWriteIntent(void* p, int len) { - _intents.push_back(WriteIntent(p, len)); - } - - - // Whether we put write intents or durops - bool _hasWritten; - - // Write intents along with a bitmask for whether we have already noted them - Already<127> _alreadyNoted; - WriteIntentsVector _intents; - - // All the ops other than basic writes - DurOpsVector _durOps; - - // Used to count the private map used bytes. Note that _lastNotedPos doesn't reset with - // each commit, but that is ok we aren't being that precise. - size_t _lastNotedPos; - size_t _bytes; - - // Warning logging for large commits - uint64_t _lastComplainMs; - unsigned _complains; -}; - -} // namespace "dur" -} // namespace "mongo" diff --git a/src/mongo/db/storage/mmap_v1/dur_journal.cpp b/src/mongo/db/storage/mmap_v1/dur_journal.cpp deleted file mode 100644 index bfb39a0bc6c..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journal.cpp +++ /dev/null @@ -1,826 +0,0 @@ -// @file dur_journal.cpp writing to the writeahead logging journal - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
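[Annotation] getIntentsSorted() above only sorts; the actual folding of overlapping intents happens via WriteIntent::absorb(). A self-contained sketch of the combined sort-then-merge step; for clarity it sorts by start address (the textbook interval merge), whereas the removed code ordered intents by their end pointer to make operator< cheap in set lookups:

#include <algorithm>
#include <vector>

struct Range {
    char* start;
    char* end;  // one past the last byte, as in WriteIntent
};

// Merge overlapping (or touching) write intents into maximal ranges.
std::vector<Range> coalesce(std::vector<Range> v) {
    std::sort(v.begin(), v.end(),
              [](const Range& a, const Range& b) { return a.start < b.start; });
    std::vector<Range> out;
    for (const Range& r : v) {
        if (!out.empty() && r.start <= out.back().end) {
            // Overlaps the previous range: absorb it, as absorb() does.
            out.back().end = std::max(out.back().end, r.end);
        } else {
            out.push_back(r);
        }
    }
    return out;
}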
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kJournal - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur_journal.h" - -#include <boost/filesystem.hpp> -#include <boost/filesystem/operations.hpp> - -#include "mongo/base/init.h" -#include "mongo/base/static_assert.h" -#include "mongo/config.h" -#include "mongo/db/client.h" -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/compress.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/db/storage/mmap_v1/dur_journalimpl.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/db/storage/mmap_v1/logfile.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/mmap_v1/paths.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/platform/random.h" -#include "mongo/util/checksum.h" -#include "mongo/util/clock_source.h" -#include "mongo/util/exit.h" -#include "mongo/util/file.h" -#include "mongo/util/hex.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/progress_meter.h" -#include "mongo/util/timer.h" - -using namespace mongoutils; - -namespace mongo { - -using std::endl; -using std::hex; -using std::string; - -class AlignedBuilder; - -namespace dur { -// Rotate after reaching this data size in a journal (j._<n>) file -// We use a smaller size for 32 bit as the journal is mmapped during recovery (only) -// Note if you take a set of datafiles, including journal files, from 32->64 or vice-versa, it must -// work. (and should as-is) -// --smallfiles makes the limit small. - -#if defined(MONGO_CONFIG_DEBUG_BUILD) -unsigned long long DataLimitPerJournalFile = 128 * 1024 * 1024; -#elif defined(__APPLE__) -// assuming a developer box if OS X -unsigned long long DataLimitPerJournalFile = 256 * 1024 * 1024; -#else -unsigned long long DataLimitPerJournalFile = - (sizeof(void*) == 4) ? 256 * 1024 * 1024 : 1 * 1024 * 1024 * 1024; -#endif - -MONGO_INITIALIZER(InitializeJournalingParams)(InitializerContext* context) { - if (mmapv1GlobalOptions.smallfiles == true) { - verify(dur::DataLimitPerJournalFile >= 128 * 1024 * 1024); - dur::DataLimitPerJournalFile = 128 * 1024 * 1024; - } - return Status::OK(); -} - -MONGO_STATIC_ASSERT(sizeof(Checksum) == 16); -MONGO_STATIC_ASSERT(sizeof(JHeader) == 8192); -MONGO_STATIC_ASSERT(sizeof(JSectHeader) == 20); -MONGO_STATIC_ASSERT(sizeof(JSectFooter) == 32); -MONGO_STATIC_ASSERT(sizeof(JEntry) == 12); -MONGO_STATIC_ASSERT(sizeof(LSNFile) == 88); - -bool usingPreallocate = false; - -void removeOldJournalFile(boost::filesystem::path p); - -boost::filesystem::path getJournalDir() { - boost::filesystem::path p(storageGlobalParams.dbpath); - p /= "journal"; - return p; -} - -boost::filesystem::path lsnPath() { - return getJournalDir() / "lsn"; -} - -/** this should be called when something really bad happens so that we can flag appropriately -*/ -void journalingFailure(const char* msg) { - /** todo: - (1) don't log too much - (2) make an indicator in the journal dir that something bad happened. - (2b) refuse to do a recovery startup if that is there without manual override. 
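[Annotation] The MONGO_STATIC_ASSERT block above guards the on-disk journal format: these structs are written to disk verbatim, so any accidental layout change must fail the build. A minimal sketch of the same guard with a hypothetical packed struct:

#include <cstdint>

// On-disk format guard: with 1-byte packing, the struct's size is part
// of the file format, and a static_assert freezes it at compile time.
#pragma pack(1)
struct DemoHeader {
    char magic[2];
    std::uint16_t version;
    std::uint32_t fileId;
};
#pragma pack()

static_assert(sizeof(DemoHeader) == 8, "journal-style header layout changed");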
- */ - log() << "journaling failure/error: " << redact(msg) << endl; - verify(false); -} - -JSectFooter::JSectFooter() { - memset(this, 0, sizeof(*this)); - sentinel = JEntry::OpCode_Footer; -} - -JSectFooter::JSectFooter(const void* begin, int len) { // needs buffer to compute hash - sentinel = JEntry::OpCode_Footer; - reserved = 0; - magic[0] = magic[1] = magic[2] = magic[3] = '\n'; - - Checksum c; - c.gen(begin, (unsigned)len); - memcpy(hash, c.bytes, sizeof(hash)); -} - -bool JSectFooter::checkHash(const void* begin, int len) const { - if (!magicOk()) { - log() << "journal footer not valid" << endl; - return false; - } - Checksum c; - c.gen(begin, len); - DEV log() << "checkHash len:" << len << " hash:" << toHex(hash, 16) - << " current:" << toHex(c.bytes, 16) << endl; - if (memcmp(hash, c.bytes, sizeof(hash)) == 0) - return true; - log() << "journal checkHash mismatch, got: " << toHex(c.bytes, 16) - << " expected: " << toHex(hash, 16) << endl; - return false; -} - -namespace { -std::unique_ptr<SecureRandom> mySecureRandom; -stdx::mutex mySecureRandomMutex; -int64_t getMySecureRandomNumber() { - stdx::lock_guard<stdx::mutex> lk(mySecureRandomMutex); - if (!mySecureRandom) - mySecureRandom = SecureRandom::create(); - return mySecureRandom->nextInt64(); -} -} - -JHeader::JHeader(string fname) { - magic[0] = 'j'; - magic[1] = '\n'; - _version = CurrentVersion; - memset(ts, 0, sizeof(ts)); - time_t t = time(0); - strncpy(ts, time_t_to_String_short(t).c_str(), sizeof(ts) - 1); - memset(dbpath, 0, sizeof(dbpath)); - strncpy(dbpath, fname.c_str(), sizeof(dbpath) - 1); - { - fileId = t & 0xffffffff; - fileId |= static_cast<unsigned long long>(getMySecureRandomNumber()) << 32; - } - memset(reserved3, 0, sizeof(reserved3)); - txt2[0] = txt2[1] = '\n'; - n1 = n2 = n3 = n4 = '\n'; -} - -Journal j; - -const unsigned long long LsnShutdownSentinel = ~((unsigned long long)0); - -Journal::Journal() : _written(0), _nextFileNumber(0), _curLogFile(0), _curFileId(0) { - _lastSeqNumberWrittenToSharedView.store(0); - _preFlushTime.store(0); - _lastFlushTime.store(0); - _writeToLSNNeeded.store(false); -} - -boost::filesystem::path Journal::getFilePathFor(int filenumber) const { - boost::filesystem::path p(dir); - p /= string(str::stream() << "j._" << filenumber); - return p; -} - -/** never throws - @param anyFiles by default we only look at j._* files. If anyFiles is true, return true - if there are any files in the journal directory. checkForUncleanShutdown() uses this to - make sure that the journal directory is mounted. 
- @return true if journal dir is not empty -*/ -bool haveJournalFiles(bool anyFiles) { - try { - boost::filesystem::path jdir = getJournalDir(); - if (!boost::filesystem::exists(jdir)) - return false; - - for (boost::filesystem::directory_iterator i(jdir); - i != boost::filesystem::directory_iterator(); - ++i) { - string fileName = boost::filesystem::path(*i).leaf().string(); - if (anyFiles || str::startsWith(fileName, "j._")) - return true; - } - } catch (const std::exception& e) { - log() << "Unable to check for journal files due to: " << e.what() << endl; - } - return false; -} - -/** throws */ -void removeJournalFiles() { - log() << "removeJournalFiles" << endl; - try { - for (boost::filesystem::directory_iterator i(getJournalDir()); - i != boost::filesystem::directory_iterator(); - ++i) { - string fileName = boost::filesystem::path(*i).leaf().string(); - if (str::startsWith(fileName, "j._")) { - try { - removeOldJournalFile(*i); - } catch (std::exception& e) { - log() << "couldn't remove " << fileName << ' ' << e.what() << endl; - throw; - } - } - } - try { - boost::filesystem::remove(lsnPath()); - } catch (...) { - // std::exception details logged in catch below - log() << "couldn't remove " << lsnPath().string() << endl; - throw; - } - } catch (std::exception& e) { - log() << "error removing journal files " << e.what() << endl; - throw; - } - verify(!haveJournalFiles()); - - flushMyDirectory(getJournalDir() / - "file"); // flushes parent of argument (in this case journal dir) - - LOG(1) << "removeJournalFiles end" << endl; -} - -/** at clean shutdown */ -bool okToCleanUp = false; // successful recovery would set this to true -void Journal::cleanup(bool _log) { - if (!okToCleanUp) - return; - - if (_log) - log() << "journalCleanup..." << endl; - try { - stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex); - closeCurrentJournalFile(); - removeJournalFiles(); - } catch (std::exception& e) { - log() << "error couldn't remove journal file during shutdown " << e.what() << endl; - throw; - } -} -void journalCleanup(bool log) { - j.cleanup(log); -} - -bool _preallocateIsFaster() { - bool faster = false; - boost::filesystem::path p = getJournalDir() / "tempLatencyTest"; - if (boost::filesystem::exists(p)) { - try { - remove(p); - } catch (const std::exception& e) { - log() << "Unable to remove temporary file due to: " << e.what() << endl; - } - } - try { - AlignedBuilder b(8192); - int millis[2]; - const int N = 50; - for (int pass = 0; pass < 2; pass++) { - LogFile f(p.string()); - Timer t; - for (int i = 0; i < N; i++) { - f.synchronousAppend(b.buf(), 8192); - } - millis[pass] = t.millis(); - // second time through, file exists and is prealloc case - } - int diff = millis[0] - millis[1]; - if (diff > 2 * N) { - // at least 2ms faster for prealloc case? - faster = true; - log() << "preallocateIsFaster=true " << diff / (1.0 * N) << endl; - } - } catch (const std::exception& e) { - log() << "info preallocateIsFaster couldn't run due to: " << e.what() << "; returning false" - << endl; - } - if (boost::filesystem::exists(p)) { - try { - remove(p); - } catch (const std::exception& e) { - log() << "Unable to remove temporary file due to: " << e.what() << endl; - } - } - return faster; -} -bool preallocateIsFaster() { - Timer t; - bool res = false; - if (_preallocateIsFaster() && _preallocateIsFaster()) { - // maybe system is just super busy at the moment? sleep a second to let it calm down. 
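[Annotation] _preallocateIsFaster() above decides a configuration question empirically: it times the same append workload twice (the second pass hits an already-allocated file) and enables preallocation only on a clear per-operation win. A generic sketch of that micro-benchmark pattern, with hypothetical parameters:

#include <chrono>

// Time two passes of a workload; report whether the second pass beat
// the first by at least minMsPerOp per operation, as the prealloc
// heuristic above requires (it used ~2 ms per append over 50 appends).
template <typename Workload>
bool secondPassClearlyFaster(Workload run, int opsPerPass, double minMsPerOp) {
    using clock = std::chrono::steady_clock;
    double ms[2];
    for (int pass = 0; pass < 2; ++pass) {
        auto t0 = clock::now();
        run(pass);  // pass 0 creates the file; pass 1 reuses it
        ms[pass] = std::chrono::duration<double, std::milli>(clock::now() - t0).count();
    }
    return (ms[0] - ms[1]) > minMsPerOp * opsPerPass;
}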
- // deciding to to prealloc is a medium big decision: - sleepsecs(1); - res = _preallocateIsFaster(); - } - if (t.millis() > 3000) - log() << "preallocateIsFaster check took " << t.millis() / 1000.0 << " secs" << endl; - return res; -} - -// throws -void preallocateFile(boost::filesystem::path p, unsigned long long len) { - if (exists(p)) - return; - - log() << "preallocating a journal file " << p.string() << endl; - - const unsigned BLKSZ = 1024 * 1024; - verify(len % BLKSZ == 0); - - AlignedBuilder b(BLKSZ); - memset((void*)b.buf(), 0, BLKSZ); - - ProgressMeter m(len, 3 /*secs*/, 10 /*hits between time check (once every 6.4MB)*/); - m.setName("File Preallocator Progress"); - - File f; - f.open(p.string().c_str(), /*read-only*/ false, /*direct-io*/ false); - verify(f.is_open()); - fileofs loc = 0; - while (loc < len) { - f.write(loc, b.buf(), BLKSZ); - loc += BLKSZ; - m.hit(BLKSZ); - } - verify(loc == len); - f.fsync(); -} - -const int NUM_PREALLOC_FILES = 3; -inline boost::filesystem::path preallocPath(int n) { - verify(n >= 0); - verify(n < NUM_PREALLOC_FILES); - string fn = str::stream() << "prealloc." << n; - return getJournalDir() / fn; -} - -// throws -void _preallocateFiles() { - for (int i = 0; i < NUM_PREALLOC_FILES; i++) { - boost::filesystem::path filepath = preallocPath(i); - - unsigned long long limit = DataLimitPerJournalFile; - if (kDebugBuild && i == 1) { - // moving 32->64, the prealloc files would be short. that is "ok", but we - // want to exercise that case, so we force exercising here when - // MONGO_CONFIG_DEBUG_BUILD is set by arbitrarily stopping prealloc at a - // low limit for a file. also we want to be able to change in the future - // the constant without a lot of work anyway. - limit = 16 * 1024 * 1024; - } - preallocateFile(filepath, limit); - } -} - -void checkFreeSpace() { - unsigned long long spaceNeeded = - static_cast<unsigned long long>(3 * DataLimitPerJournalFile * 1.1); // add 10% for headroom - unsigned long long freeSpace = File::freeSpace(getJournalDir().string()); - unsigned long long prealloced = 0; - for (int i = 0; i < NUM_PREALLOC_FILES; i++) { - boost::filesystem::path filepath = preallocPath(i); - if (exists(filepath)) - prealloced += file_size(filepath); - } - - if (freeSpace + prealloced < spaceNeeded) { - log() << endl; - error() << "Insufficient free space for journal files" << endl; - log() << "Please make at least " << spaceNeeded / (1024 * 1024) << "MB available in " - << getJournalDir().string() << " or use --smallfiles" << endl; - log() << endl; - uasserted(15926, "Insufficient free space for journals"); - } -} - -void preallocateFiles() { - if (!(mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalNoCheckSpace)) - checkFreeSpace(); - - if (exists(preallocPath(0)) || // if enabled previously, keep using - exists(preallocPath(1)) || - (mmapv1GlobalOptions.preallocj && preallocateIsFaster())) { - usingPreallocate = true; - try { - _preallocateFiles(); - } catch (const std::exception& e) { - log() << "warning caught exception (" << e.what() << ") in preallocateFiles, continuing" - << endl; - } - } - j.open(); -} - -void removeOldJournalFile(boost::filesystem::path p) { - if (usingPreallocate) { - try { - for (int i = 0; i < NUM_PREALLOC_FILES; i++) { - boost::filesystem::path filepath = preallocPath(i); - if (!boost::filesystem::exists(filepath)) { - // we can recycle this file into this prealloc file location - boost::filesystem::path temppath = filepath.string() + ".temp"; - boost::filesystem::rename(p, temppath); - { - // 
zero the header - File f; - f.open(temppath.string().c_str(), false, false); - char buf[8192]; - memset(buf, 0, 8192); - f.write(0, buf, 8192); - f.truncate(DataLimitPerJournalFile); - f.fsync(); - } - log() << "old journal file " << p.string() << " will be reused as " - << filepath.string(); - boost::filesystem::rename(temppath, filepath); - return; - } - } - } catch (const std::exception& e) { - log() << "warning exception in dur::removeOldJournalFile " << p.string() << ": " - << e.what() << endl; - // fall through and try to delete the file - } - } - - // already have 3 prealloc files, so delete this file - try { - log() << "old journal file will be removed: " << p.string() << endl; - boost::filesystem::remove(p); - } catch (const std::exception& e) { - log() << "warning exception removing " << p.string() << ": " << e.what() << endl; - } -} - -// find a prealloc.<n> file, presumably to take and use -boost::filesystem::path findPrealloced() { - try { - for (int i = 0; i < NUM_PREALLOC_FILES; i++) { - boost::filesystem::path filepath = preallocPath(i); - if (boost::filesystem::exists(filepath)) - return filepath; - } - } catch (const std::exception& e) { - log() << "warning exception in dur::findPrealloced(): " << e.what() << endl; - } - return boost::filesystem::path(); -} - -/** assure journal/ dir exists. throws. call during startup. */ -void journalMakeDir(ClockSource* cs, int64_t serverStartMs) { - j.init(cs, serverStartMs); - - boost::filesystem::path p = getJournalDir(); - j.dir = p.string(); - log() << "journal dir=" << j.dir << endl; - if (!boost::filesystem::exists(j.dir)) { - try { - boost::filesystem::create_directory(j.dir); - } catch (std::exception& e) { - log() << "error creating directory " << j.dir << ' ' << e.what() << endl; - throw; - } - } -} - -void Journal::_open() { - _curFileId = 0; - verify(_curLogFile == 0); - boost::filesystem::path fname = getFilePathFor(_nextFileNumber); - - // if we have a prealloced file, use it - { - boost::filesystem::path p = findPrealloced(); - if (!p.empty()) { - try { - { - // JHeader::fileId must be updated before renaming to be race-safe - LogFile f(p.string()); - JHeader h(p.string()); - AlignedBuilder b(8192); - b.appendStruct(h); - f.synchronousAppend(b.buf(), b.len()); - } - boost::filesystem::rename(p, fname); - } catch (const std::exception& e) { - log() << "warning couldn't write to / rename file " << p.string() << ": " - << e.what() << endl; - } - } - } - - _curLogFile = new LogFile(fname.string()); - _nextFileNumber++; - { - JHeader h(fname.string()); - _curFileId = h.fileId; - verify(_curFileId); - AlignedBuilder b(8192); - b.appendStruct(h); - _curLogFile->synchronousAppend(b.buf(), b.len()); - } -} - -void Journal::init(ClockSource* cs, int64_t serverStartMs) { - verify(_curLogFile == 0); - _clock = cs; - _serverStartMs = serverStartMs; -} - -void Journal::open() { - stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex); - _open(); -} - -void LSNFile::set(unsigned long long x) { - memset(this, 0, sizeof(*this)); - lsn = x; - checkbytes = ~x; -} - -/** logs details of the situation, and returns 0, if anything surprising in the LSNFile - if something highly surprising, throws to abort -*/ -unsigned long long LSNFile::get() { - uassert(13614, - str::stream() << "unexpected version number of lsn file in journal/ directory got: " - << ver, - ver == 0); - if (~lsn != checkbytes) { - log() << "lsnfile not valid. recovery will be from log start. 
lsn: " << hex << lsn - << " checkbytes: " << hex << checkbytes << endl; - return 0; - } - return lsn; -} - -/** called during recovery (the error message text below assumes that) -*/ -unsigned long long journalReadLSN() { - if (!exists(lsnPath())) { - log() << "info no lsn file in journal/ directory" << endl; - return 0; - } - - try { - // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. - // however, given we actually close the file when writing, that seems unlikely. - LSNFile L; - File f; - f.open(lsnPath().string().c_str()); - verify(f.is_open()); - if (f.len() == 0) { - // this could be 'normal' if we crashed at the right moment - log() << "info lsn file is zero bytes long" << endl; - return 0; - } - f.read(0, (char*)&L, sizeof(L)); - unsigned long long lsn = L.get(); - return lsn; - } catch (std::exception& e) { - uasserted(13611, - str::stream() << "can't read lsn file in journal directory : " << e.what()); - } - return 0; -} - -/** remember "last sequence number" to speed recoveries - concurrency: called by durThread only. -*/ -void Journal::updateLSNFile(unsigned long long lsnOfCurrentJournalEntry) { - if (!_writeToLSNNeeded.load()) - return; - _writeToLSNNeeded.store(false); - try { - // Don't read from _lastFlushTime again in this function since it may change. - const uint64_t copyOfLastFlushTime = _lastFlushTime.load(); - - // Only write an LSN that is older than the journal entry we are in the middle of writing. - // If this trips, it means that _lastFlushTime got ahead of what is actually in the data - // files because lsnOfCurrentJournalEntry includes data that hasn't yet been written to the - // data files. - if (copyOfLastFlushTime >= lsnOfCurrentJournalEntry) { - severe() << "Attempting to update LSNFile to " << copyOfLastFlushTime - << " which is not older than the current journal sequence number " - << lsnOfCurrentJournalEntry; - fassertFailed(34370); - } - - // os can flush as it likes. if it flushes slowly, we will just do extra work on recovery. - // however, given we actually close the file, that seems unlikely. - File f; - f.open(lsnPath().string().c_str()); - if (!f.is_open()) { - // can get 0 if an i/o error - log() << "warning: open of lsn file failed" << endl; - return; - } - LOG(1) << "lsn set " << copyOfLastFlushTime << endl; - LSNFile lsnf; - lsnf.set(copyOfLastFlushTime); - f.write(0, (char*)&lsnf, sizeof(lsnf)); - // do we want to fsync here? if we do it probably needs to be async so the durthread - // is not delayed. - } catch (std::exception& e) { - log() << "warning: write to lsn file failed " << e.what() << endl; - // keep running (ignore the error). recovery will be slow. - } -} - -namespace { -stdx::mutex lastGeneratedSeqNumberMutex; -uint64_t lastGeneratedSeqNumber = 0; -} - -uint64_t generateNextSeqNumber(ClockSource* cs, int64_t serverStartMs) { - const uint64_t now = cs->now().toMillisSinceEpoch() - serverStartMs; - - stdx::lock_guard<stdx::mutex> lock(lastGeneratedSeqNumberMutex); - if (now > lastGeneratedSeqNumber) { - lastGeneratedSeqNumber = now; - } else { - // Make sure we return unique monotonically increasing numbers. 
- lastGeneratedSeqNumber++; - } - return lastGeneratedSeqNumber; -} - -void setLastSeqNumberWrittenToSharedView(uint64_t seqNumber) { - j._lastSeqNumberWrittenToSharedView.store(seqNumber); -} - -void notifyPreDataFileFlush() { - j._preFlushTime.store(j._lastSeqNumberWrittenToSharedView.load()); -} - -void notifyPostDataFileFlush() { - j._lastFlushTime.store(j._preFlushTime.load()); - j._writeToLSNNeeded.store(true); -} - -// call from within _curLogFileMutex -void Journal::closeCurrentJournalFile() { - if (!_curLogFile) - return; - - JFile jf; - jf.filename = _curLogFile->_name; - jf.lastEventTimeMs = generateNextSeqNumber(_clock, _serverStartMs); - _oldJournalFiles.push_back(jf); - - delete _curLogFile; // close - _curLogFile = 0; - _written = 0; -} - -/** remove older journal files. - be in _curLogFileMutex but not dbMutex when calling -*/ -void Journal::removeUnneededJournalFiles() { - while (!_oldJournalFiles.empty()) { - JFile f = _oldJournalFiles.front(); - - // 'f.lastEventTimeMs' is the timestamp of the last thing in the journal file. - // '_lastFlushTime' is the start time of the last successful flush of the data files to - // disk. We can't delete this journal file until the last successful flush time is at least - // 10 seconds after 'f.lastEventTimeMs'. - if (f.lastEventTimeMs + ExtraKeepTimeMs < _lastFlushTime.load()) { - // eligible for deletion - boost::filesystem::path p(f.filename); - removeOldJournalFile(p); - } else { - break; - } - - _oldJournalFiles.pop_front(); - } -} - -void Journal::_rotate(unsigned long long lsnOfCurrentJournalEntry) { - if (globalInShutdownDeprecated() || !_curLogFile) - return; - - j.updateLSNFile(lsnOfCurrentJournalEntry); - - if (_curLogFile && _written < DataLimitPerJournalFile) - return; - - if (_curLogFile) { - _curLogFile->truncate(); - closeCurrentJournalFile(); - removeUnneededJournalFiles(); - } - - try { - Timer t; - _open(); - int ms = t.millis(); - if (ms >= 200) { - log() << "DR101 latency warning on journal file open " << ms << "ms" << endl; - } - } catch (std::exception& e) { - log() << "warning exception opening journal file " << e.what() << endl; - throw; - } -} - -/** write (append) the buffer we have built to the journal and fsync it. - outside of dbMutex lock as this could be slow. 
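[Annotation] generateNextSeqNumber() above derives sequence numbers from a millisecond clock but must stay strictly increasing even for same-millisecond calls or a clock that steps backwards. A standalone sketch of that scheme (the caller supplies the clock reading, as the real function did via ClockSource):

#include <cstdint>
#include <mutex>

class SeqNumberSource {
public:
    std::uint64_t next(std::uint64_t nowMs) {
        std::lock_guard<std::mutex> lk(_mutex);
        // Take the clock when it advanced; otherwise bump by one so
        // every caller gets a unique, monotonically increasing value.
        _last = (nowMs > _last) ? nowMs : _last + 1;
        return _last;
    }

private:
    std::mutex _mutex;
    std::uint64_t _last = 0;
};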
- @param uncompressed - a buffer that will be written to the journal after compression - will not return until on disk -*/ -void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed) { - Timer t; - j.journal(h, uncompressed); - stats.curr()->_writeToJournalMicros += t.micros(); -} - -void Journal::journal(const JSectHeader& h, const AlignedBuilder& uncompressed) { - static AlignedBuilder b(32 * 1024 * 1024); - /* buffer to journal will be - JSectHeader - compressed operations - JSectFooter - */ - const unsigned headTailSize = sizeof(JSectHeader) + sizeof(JSectFooter); - const unsigned max = maxCompressedLength(uncompressed.len()) + headTailSize; - b.reset(max); - - { - dassert(h.sectionLen() == (unsigned)0xffffffff); // we will backfill later - b.appendStruct(h); - } - - size_t compressedLength = 0; - rawCompress(uncompressed.buf(), uncompressed.len(), b.cur(), &compressedLength); - verify(compressedLength < 0xffffffff); - verify(compressedLength < max); - b.skip(compressedLength); - - // footer - unsigned L = 0xffffffff; - { - // pad to alignment, and set the total section length in the JSectHeader - verify(0xffffe000 == (~(Alignment - 1))); - unsigned lenUnpadded = b.len() + sizeof(JSectFooter); - L = (lenUnpadded + Alignment - 1) & (~(Alignment - 1)); - dassert(L >= lenUnpadded); - - ((JSectHeader*)b.atOfs(0))->setSectionLen(lenUnpadded); - - JSectFooter f(b.buf(), b.len()); // computes checksum - b.appendStruct(f); - dassert(b.len() == lenUnpadded); - - b.skip(L - lenUnpadded); - dassert(b.len() % Alignment == 0); - } - - try { - stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex); - - // must already be open -- so that _curFileId is correct for previous buffer building - verify(_curLogFile); - - stats.curr()->_uncompressedBytes += uncompressed.len(); - unsigned w = b.len(); - _written += w; - verify(w <= L); - stats.curr()->_journaledBytes += L; - _curLogFile->synchronousAppend((const void*)b.buf(), L); - _rotate(h.seqNumber); - } catch (std::exception& e) { - log() << "error exception in dur::journal " << e.what() << endl; - throw; - } -} -} -} diff --git a/src/mongo/db/storage/mmap_v1/dur_journal.h b/src/mongo/db/storage/mmap_v1/dur_journal.h deleted file mode 100644 index e1da1b65818..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journal.h +++ /dev/null @@ -1,100 +0,0 @@ -// @file dur_journal.h - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. 
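[Annotation] Journal::journal() above pads each compressed section up to the 8 KB journal Alignment with a power-of-two mask. The arithmetic in isolation, with the cases checked at compile time:

// Pad an unpadded section length up to the next Alignment boundary.
// The mask trick requires Alignment to be a power of two.
constexpr unsigned kAlignment = 8192;

constexpr unsigned padToAlignment(unsigned lenUnpadded) {
    return (lenUnpadded + kAlignment - 1) & ~(kAlignment - 1);
}

static_assert(padToAlignment(1) == 8192, "rounds up");
static_assert(padToAlignment(8192) == 8192, "exact multiple unchanged");
static_assert(padToAlignment(8193) == 16384, "next boundary");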
If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <cstdint> - -namespace mongo { - -class AlignedBuilder; -class ClockSource; -class JSectHeader; - -namespace dur { - -/** true if ok to cleanup journal files at termination. otherwise, files journal will be retained. -*/ -extern bool okToCleanUp; - -/** at termination after db files closed & fsynced - also after recovery - closes and removes journal files - @param log report in log that we are cleaning up if we actually do any work -*/ -void journalCleanup(bool log = false); - -/** assure journal/ dir exists. throws */ -void journalMakeDir(ClockSource* cs, int64_t serverStartMs); - -/** - * Generates the next sequence number for use in the journal, guaranteed to be greater than all - * prior sequence numbers. - */ -uint64_t generateNextSeqNumber(ClockSource* cs, int64_t serverStartMs); - -/** - * Informs the journaling system that all writes on or before the passed in sequence number have - * been written to the data files' shared mmap view. - */ -void setLastSeqNumberWrittenToSharedView(uint64_t seqNumber); - -/** flag that something has gone wrong during writing to the journal - (not for recovery mode) -*/ -void journalingFailure(const char* msg); - -/** read lsn from disk from the last run before doing recovery */ -unsigned long long journalReadLSN(); - -/** never throws. - @param anyFiles by default we only look at j._* files. If anyFiles is true, return true - if there are any files in the journal directory. checkForUncleanShutdown() uses this to - make sure that the journal directory is mounted. - @return true if there are any journal files in the journal dir. -*/ -bool haveJournalFiles(bool anyFiles = false); - -/** - * Writes the specified uncompressed buffer to the journal. - */ -void WRITETOJOURNAL(const JSectHeader& h, const AlignedBuilder& uncompressed); - -// in case disk controller buffers writes -const long long ExtraKeepTimeMs = 10000; - -/** - * Call these before (pre) and after (post) the datafiles are flushed to disk by the DataFileSync - * thread. These should not be called for any other flushes. - */ -void notifyPreDataFileFlush(); -void notifyPostDataFileFlush(); -} // namespace dur -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp b/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp deleted file mode 100644 index 3b244c25006..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journal_writer.cpp +++ /dev/null @@ -1,307 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
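[Annotation] ExtraKeepTimeMs, declared just above, feeds the retention rule in Journal::removeUnneededJournalFiles() earlier in this diff: a journal file is deletable only once the data files have been flushed past everything it contains, plus a 10-second margin in case a disk controller buffers writes. The rule in isolation:

#include <cstdint>

constexpr std::uint64_t kExtraKeepTimeMs = 10000;  // ExtraKeepTimeMs above

// lastEventTimeMs: timestamp of the last entry in the journal file.
// lastFlushTimeMs: start time of the last successful data-file flush.
bool journalFileDeletable(std::uint64_t lastEventTimeMs,
                          std::uint64_t lastFlushTimeMs) {
    return lastEventTimeMs + kExtraKeepTimeMs < lastFlushTimeMs;
}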
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kJournal - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur_journal_writer.h" - -#include "mongo/db/client.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/dur_recover.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/stdx/functional.h" -#include "mongo/stdx/thread.h" -#include "mongo/util/concurrency/idle_thread_block.h" -#include "mongo/util/log.h" -#include "mongo/util/timer.h" - -namespace mongo { -namespace dur { - -namespace { - -/** - * Apply the writes back to the non-private MMF after they are for certain in the journal. - * - * (1) TODO we don't need to write back everything every group commit. We MUST write back that - * which is going to be a remapped on its private view - but that might not be all views. - * - * (2) TODO should we do this using N threads? Would be quite easy see Hackenberg paper table - * 5 and 6. 2 threads might be a good balance. - */ -void WRITETODATAFILES(OperationContext* opCtx, - const JSectHeader& h, - const AlignedBuilder& uncompressed) { - Timer t; - - LOG(4) << "WRITETODATAFILES BEGIN"; - - RecoveryJob::get().processSection(opCtx, &h, uncompressed.buf(), uncompressed.len(), NULL); - - const long long m = t.micros(); - stats.curr()->_writeToDataFilesMicros += m; - - setLastSeqNumberWrittenToSharedView(h.seqNumber); - - LOG(4) << "journal WRITETODATAFILES " << m / 1000.0 << "ms"; -} - -} // namespace - - -/** - * Used inside the journal writer thread to ensure that used buffers are cleaned up properly. - */ -class BufferGuard { - MONGO_DISALLOW_COPYING(BufferGuard); - -public: - BufferGuard(JournalWriter::Buffer* buffer, JournalWriter::BufferQueue* bufferQueue) - : _buffer(buffer), _bufferQueue(bufferQueue) {} - - ~BufferGuard() { - // This buffer is done. Reset and remove it from the journal queue and put it on - // the ready queue. - _buffer->_reset(); - - // This should never block. Otherwise we will stall the journaling pipeline - // permanently and cause deadlock. - invariant(_bufferQueue->count() < _bufferQueue->maxSize()); - _bufferQueue->push(_buffer); - } - -private: - // Buffer that this scoped object is managing. Owned until destruction time. Then, the - // bufferQueue owns it. - JournalWriter::Buffer* const _buffer; - - // Queue where the buffer should be returned to at destruction time. Not owned. 
- JournalWriter::BufferQueue* const _bufferQueue; -}; - - -// -// JournalWriter -// - -JournalWriter::JournalWriter(CommitNotifier* commitNotify, - CommitNotifier* applyToDataFilesNotify, - size_t numBuffers) - : _commitNotify(commitNotify), - _applyToDataFilesNotify(applyToDataFilesNotify), - _shutdownRequested(false), - _journalQueue(numBuffers), - _lastCommitNumber(0), - _readyQueue(numBuffers) { - invariant(_journalQueue.maxSize() == _readyQueue.maxSize()); -} - -JournalWriter::~JournalWriter() { - // Never close the journal writer with outstanding or unaccounted writes - invariant(_journalQueue.empty()); - invariant(_readyQueue.empty()); -} - -void JournalWriter::start() { - // Do not allow reuse - invariant(!_shutdownRequested); - - // Pre-allocate the journal buffers and push them on the ready queue - for (size_t i = 0; i < _readyQueue.maxSize(); i++) { - _readyQueue.push(new Buffer(InitialBufferSizeBytes)); - } - - // Start the thread - stdx::thread t([this] { _journalWriterThread(); }); - _journalWriterThreadHandle.swap(t); -} - -void JournalWriter::shutdown() { - // There is no reason to call shutdown multiple times - invariant(!_shutdownRequested); - _shutdownRequested = true; - - // Never terminate the journal writer with outstanding or unaccounted writes - assertIdle(); - - Buffer* const shutdownBuffer = newBuffer(); - shutdownBuffer->_setShutdown(); - - // This will terminate the journal thread. No need to specify commit number, since we are - // shutting down and nothing will be notified anyways. - writeBuffer(shutdownBuffer, 0); - - // Ensure the journal thread has stopped and everything accounted for. - _journalWriterThreadHandle.join(); - assertIdle(); - - // Delete the buffers (this deallocates the journal buffer memory) - while (!_readyQueue.empty()) { - Buffer* const buffer = _readyQueue.blockingPop(); - delete buffer; - } -} - -void JournalWriter::assertIdle() { - // All buffers are in the ready queue means there is nothing pending. - invariant(_journalQueue.empty()); - invariant(_readyQueue.count() == _readyQueue.maxSize()); -} - -JournalWriter::Buffer* JournalWriter::newBuffer() { - Buffer* const buffer = _readyQueue.blockingPop(); - buffer->_assertEmpty(); - - return buffer; -} - -void JournalWriter::writeBuffer(Buffer* buffer, CommitNotifier::When commitNumber) { - invariant(buffer->_commitNumber == 0); - invariant((commitNumber > _lastCommitNumber) || (buffer->_isShutdown && (commitNumber == 0))); - - buffer->_commitNumber = commitNumber; - - _journalQueue.push(buffer); -} - -void JournalWriter::flush() { - std::vector<Buffer*> buffers; - - // Pop the expected number of buffers from the ready queue. This will block until all - // in-progress buffers have completed. - for (size_t i = 0; i < _readyQueue.maxSize(); i++) { - buffers.push_back(_readyQueue.blockingPop()); - } - - // Put them back in to restore the original state. - for (size_t i = 0; i < buffers.size(); i++) { - _readyQueue.push(buffers[i]); - } -} - -void JournalWriter::_journalWriterThread() { - Client::initThread("journal writer"); - - log() << "Journal writer thread started"; - - try { - while (true) { - Buffer* const buffer = [&] { - MONGO_IDLE_THREAD_BLOCK; - return _journalQueue.blockingPop(); - }(); - - BufferGuard bufferGuard(buffer, &_readyQueue); - - if (buffer->_isShutdown) { - invariant(buffer->_builder.len() == 0); - - // The journal writer thread is terminating. Nothing to notify or write. 
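[Annotation] JournalWriter::flush() above rests on a counting argument: every buffer lives in exactly one of the two queues, so popping maxSize() buffers from the ready queue can only complete once no buffer is still in flight. A sketch of that barrier against a hypothetical queue with the same blockingPop()/push()/maxSize() surface as the BlockingQueue used above:

#include <cstddef>
#include <vector>

template <typename Queue, typename BufferPtr>
void waitForAllWrites(Queue& readyQueue) {
    std::vector<BufferPtr> held;
    // Blocks until the writer thread has returned every buffer.
    for (std::size_t i = 0; i < readyQueue.maxSize(); ++i)
        held.push_back(readyQueue.blockingPop());
    // Put them back to restore the original state, as flush() does.
    for (BufferPtr b : held)
        readyQueue.push(b);
}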
- break; - } - - if (buffer->_isNoop) { - invariant(buffer->_builder.len() == 0); - - // There's nothing to be writen, but we still need to notify this commit number - _commitNotify->notifyAll(buffer->_commitNumber); - _applyToDataFilesNotify->notifyAll(buffer->_commitNumber); - continue; - } - - LOG(4) << "Journaling commit number " << buffer->_commitNumber << " (journal file " - << buffer->_header.fileId << ", sequence " << buffer->_header.seqNumber - << ", size " << buffer->_builder.len() << " bytes)"; - - // This performs synchronous I/O to the journal file and will block. - WRITETOJOURNAL(buffer->_header, buffer->_builder); - - // Data is now persisted in the journal, which is sufficient for acknowledging - // durability. - dur::getJournalListener()->onDurable(buffer->journalListenerToken); - _commitNotify->notifyAll(buffer->_commitNumber); - - // Apply the journal entries on top of the shared view so that when flush is - // requested it would write the latest. - WRITETODATAFILES(cc().makeOperationContext().get(), buffer->_header, buffer->_builder); - - // Data is now persisted on the shared view, so notify any potential journal file - // cleanup waiters. - _applyToDataFilesNotify->notifyAll(buffer->_commitNumber); - } - } catch (const DBException& e) { - severe() << "dbexception in journalWriterThread causing immediate shutdown: " << redact(e); - MONGO_UNREACHABLE; - } catch (const std::ios_base::failure& e) { - severe() << "ios_base exception in journalWriterThread causing immediate shutdown: " - << e.what(); - MONGO_UNREACHABLE; - } catch (const std::bad_alloc& e) { - severe() << "bad_alloc exception in journalWriterThread causing immediate shutdown: " - << e.what(); - MONGO_UNREACHABLE; - } catch (const std::exception& e) { - severe() << "exception in journalWriterThread causing immediate shutdown: " - << redact(e.what()); - MONGO_UNREACHABLE; - } catch (...) { - severe() << "unhandled exception in journalWriterThread causing immediate shutdown"; - MONGO_UNREACHABLE; - } - - log() << "Journal writer thread stopped"; -} - - -// -// Buffer -// - -JournalWriter::Buffer::Buffer(size_t initialSize) - : _commitNumber(0), _isNoop(false), _isShutdown(false), _header(), _builder(initialSize) {} - -JournalWriter::Buffer::~Buffer() { - _assertEmpty(); -} - -void JournalWriter::Buffer::_assertEmpty() { - invariant(_commitNumber == 0); - invariant(_builder.len() == 0); -} - -void JournalWriter::Buffer::_reset() { - _commitNumber = 0; - _isNoop = false; - _builder.reset(); -} - -} // namespace dur -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_journal_writer.h b/src/mongo/db/storage/mmap_v1/dur_journal_writer.h deleted file mode 100644 index de36e202f81..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journal_writer.h +++ /dev/null @@ -1,200 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
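[Annotation] The cascade of catch clauses ending in MONGO_UNREACHABLE in _journalWriterThread() above is a deliberate fail-fast stance: a thread that persists data must not limp along after an unexpected exception. The pattern distilled into a standalone wrapper (runOrDie is a hypothetical name; abort() stands in for severe() + MONGO_UNREACHABLE):

#include <cstdio>
#include <cstdlib>
#include <exception>

template <typename F>
void runOrDie(F body, const char* threadName) {
    try {
        body();
    } catch (const std::exception& e) {
        std::fprintf(stderr, "%s: fatal: %s\n", threadName, e.what());
        std::abort();
    } catch (...) {
        std::fprintf(stderr, "%s: fatal: unknown exception\n", threadName);
        std::abort();
    }
}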
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/base/disallow_copying.h" -#include "mongo/db/storage/journal_listener.h" -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/commit_notifier.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/stdx/thread.h" -#include "mongo/util/queue.h" - -namespace mongo { -namespace dur { - -/** - * Manages the thread and queues used for writing the journal to disk and notify parties with - * are waiting on the write concern. - * - * NOTE: Not thread-safe and must not be used from more than one thread. - */ -class JournalWriter { - MONGO_DISALLOW_COPYING(JournalWriter); - -public: - /** - * Stores the memory and the header for a complete journal buffer which is pending to be - * written by the journal writer thread. - */ - class Buffer { - public: - Buffer(size_t initialSize); - ~Buffer(); - - JSectHeader& getHeader() { - return _header; - } - AlignedBuilder& getBuilder() { - return _builder; - } - - void setNoop() { - _isNoop = true; - } - - JournalListener::Token journalListenerToken; - - private: - friend class BufferGuard; - friend class JournalWriter; - - - void _assertEmpty(); - void _reset(); - void _setShutdown() { - _isShutdown = true; - } - - // Specifies the commit number which flushing this buffer would notify. This value is - // zero, if there is no data to be flushed or if the buffer is noop/shutdown. - CommitNotifier::When _commitNumber; - - // Special buffer that's posted when there is nothing to be written to the journal, - // but we want to order a notification so it happens after all other writes have - // completed. - bool _isNoop; - - // Special buffer that's posted when the receiving thread must terminate. This should - // be the last entry posted to the queue and the commit number should be zero. - bool _isShutdown; - - JSectHeader _header; - AlignedBuilder _builder; - }; - - - /** - * Initializes the journal writer. - * - * @param commitNotify Notification object to be called after journal entries have been - * written to disk. The caller retains ownership and the notify object must outlive - * the journal writer object. - * @param applyToDataFilesNotify Notification object to be called after journal entries - * have been applied to the shared view. This means that if the shared view were to be - * flushed at this point, the journal files before this point are not necessary. The - * caller retains ownership and the notify object must outlive the journal writer - * object. - * @param numBuffers How many buffers to create to hold outstanding writes. If there are - * more than this number of journal writes that have not completed, the write calls - * will block. 
- */ - JournalWriter(CommitNotifier* commitNotify, - CommitNotifier* applyToDataFilesNotify, - size_t numBuffers); - ~JournalWriter(); - - /** - * Allocates buffer memory and starts the journal writer thread. - */ - void start(); - - /** - * Terminates the journal writer thread and frees memory for the buffers. Must not be - * called if there are any pending journal writes. - */ - void shutdown(); - - /** - * Asserts that there are no pending journal writes. - */ - void assertIdle(); - - /** - * Obtains a new empty buffer into which a journal entry should be written. - * - * This method may block if there are no free buffers. - * - * The caller does not own the buffer and needs to "return" it to the writer by calling - * writeBuffer. Buffers with data on them should never be discarded until they are written. - */ - Buffer* newBuffer(); - - /** - * Requests that the specified buffer be written asynchronously. - * - * This method may block if there are too many outstanding unwritten buffers. - * - * @param buffer Buffer entry to be written. The buffer object must not be used anymore - * after it has been given to this function. - * @param commitNumber What commit number to be notified once the buffer has been written - * to disk. - */ - void writeBuffer(Buffer* buffer, CommitNotifier::When commitNumber); - - /** - * Ensures that all previously submitted write requests complete. This call is blocking. - */ - void flush(); - -private: - friend class BufferGuard; - - typedef BlockingQueue<Buffer*> BufferQueue; - - // Start all buffers with 4MB of size - enum { InitialBufferSizeBytes = 4 * 1024 * 1024 }; - - - void _journalWriterThread(); - - - // This gets notified as journal buffers are written. It is not owned and needs to outlive - // the journal writer object. - CommitNotifier* const _commitNotify; - - // This gets notified as journal buffers are done being applied to the shared view - CommitNotifier* const _applyToDataFilesNotify; - - // Wraps and controls the journal writer thread - stdx::thread _journalWriterThreadHandle; - - // Indicates that shutdown has been requested. Used for idempotency of the shutdown call. - bool _shutdownRequested; - - // Queue of buffers, which need to be written by the journal writer thread - BufferQueue _journalQueue; - CommitNotifier::When _lastCommitNumber; - - // Queue of buffers, whose write has been completed by the journal writer thread. - BufferQueue _readyQueue; -}; - -} // namespace dur -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_journalformat.h b/src/mongo/db/storage/mmap_v1/dur_journalformat.h deleted file mode 100644 index 964c0b79b9b..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journalformat.h +++ /dev/null @@ -1,219 +0,0 @@ -// @file dur_journalformat.h The format of our journal files. - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. 
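[Annotation] A hypothetical producer-side use of the JournalWriter API declared above (assumes CommitNotifier lives in namespace mongo, as its include suggests; the fill step is elided because the real caller in the removed dur.cpp built a complete compressed section there):

void journalOneCommit(mongo::dur::JournalWriter& writer,
                      mongo::CommitNotifier::When commitNumber) {
    // May block if all buffers are in flight.
    mongo::dur::JournalWriter::Buffer* buf = writer.newBuffer();

    // ... fill buf->getHeader() and buf->getBuilder() here ...

    // Hands the buffer back to the writer; the journal thread writes it
    // asynchronously and notifies commitNumber when durable.
    writer.writeBuffer(buf, commitNumber);
}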
-* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <sstream> -#include <string> - -#include "mongo/util/assert_util.h" - -namespace mongo { - -namespace dur { - -const unsigned Alignment = 8192; - -#pragma pack(1) -/** beginning header for a journal/j._<n> file - there is nothing important int this header at this time. except perhaps version #. -*/ -struct JHeader { - JHeader() {} - JHeader(std::string fname); - - // "j\n". j means journal, then a linefeed, fwiw if you were to run "less" on the file or - // something... - char magic[2]; - -// x4142 is asci--readable if you look at the file with head/less -- thus the starting values were -// near that. simply incrementing the version # is safe on a fwd basis. -#if defined(_NOCOMPRESS) - enum { CurrentVersion = 0x4148 }; -#else - enum { CurrentVersion = 0x4149 }; -#endif - unsigned short _version; - - // these are just for diagnostic ease (make header more useful as plain text) - char n1; // '\n' - char ts[20]; // ascii timestamp of file generation. for user reading, not used by code. - char n2; // '\n' - char dbpath[128]; // path/filename of this file for human reading and diagnostics. not used - // by code. - char n3, n4; // '\n', '\n' - - unsigned long long fileId; // unique identifier that will be in each JSectHeader. - // important as we recycle prealloced files - - char reserved3[8026]; // 8KB total for the file header - char txt2[2]; // "\n\n" at the end - - bool versionOk() const { - return _version == CurrentVersion; - } - bool valid() const { - return magic[0] == 'j' && txt2[1] == '\n' && fileId; - } -}; - -/** "Section" header. A section corresponds to a group commit. - len is length of the entire section including header and footer. - header and footer are not compressed, just the stuff in between. -*/ -struct JSectHeader { -private: - unsigned _sectionLen; // unpadded length in bytes of the whole section -public: - unsigned long long - seqNumber; // sequence number that can be used on recovery to not do too much work - unsigned long long fileId; // matches JHeader::fileId - unsigned sectionLen() const { - return _sectionLen; - } - - // we store the unpadded length so we can use that when we uncompress. to - // get the true total size this must be rounded up to the Alignment. - void setSectionLen(unsigned lenUnpadded) { - _sectionLen = lenUnpadded; - } - - unsigned sectionLenWithPadding() const { - unsigned x = (sectionLen() + (Alignment - 1)) & (~(Alignment - 1)); - dassert(x % Alignment == 0); - return x; - } -}; - -/** an individual write operation within a group commit section. Either the entire section should - be applied, or nothing. (We check the md5 for the whole section before doing anything on - recovery.) 
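As an aside, the round-up in sectionLenWithPadding() above is the standard power-of-two alignment idiom; a worked example with Alignment == 8192:

// (len + (Alignment - 1)) & ~(Alignment - 1) rounds len up to the next
// multiple of Alignment; this only works because Alignment is a power of two.
unsigned roundUpToAlignment(unsigned len) {
    return (len + (Alignment - 1)) & ~(Alignment - 1);
}
// roundUpToAlignment(1)    == 8192
// roundUpToAlignment(8192) == 8192
// roundUpToAlignment(8193) == 16384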
-*/ -struct JEntry { - enum OpCodes { - OpCode_Footer = 0xffffffff, - OpCode_DbContext = 0xfffffffe, - OpCode_FileCreated = 0xfffffffd, - OpCode_DropDb = 0xfffffffc, - OpCode_Min = 0xfffff000 - }; - union { - unsigned - len; // length in bytes of the data of the JEntry. does not include the JEntry header - OpCodes opcode; - }; - - unsigned ofs; // offset in file - - // sentinel and masks for _fileNo - enum { - DotNsSuffix = 0x7fffffff, // ".ns" file - LocalDbBit = 0x80000000 // assuming "local" db instead of using the JDbContext - }; - int _fileNo; // high bit is set to indicate it should be the <dbpath>/local database - // char data[len] follows - - const char* srcData() const { - const int* i = &_fileNo; - return (const char*)(i + 1); - } - - int getFileNo() const { - return _fileNo & (~LocalDbBit); - } - void setFileNo(int f) { - _fileNo = f; - } - bool isNsSuffix() const { - return getFileNo() == DotNsSuffix; - } - - void setLocalDbContextBit() { - _fileNo |= LocalDbBit; - } - bool isLocalDbContext() const { - return _fileNo & LocalDbBit; - } - void clearLocalDbContextBit() { - _fileNo = getFileNo(); - } - - static std::string suffix(int fileno) { - if (fileno == DotNsSuffix) - return "ns"; - std::stringstream ss; - ss << fileno; - return ss.str(); - } -}; - -/** group commit section footer. md5 is a key field. */ -struct JSectFooter { - JSectFooter(); - JSectFooter(const void* begin, int len); // needs buffer to compute hash - unsigned sentinel; - unsigned char hash[16]; - unsigned long long reserved; - char magic[4]; // "\n\n\n\n" - - /** used by recovery to see if buffer is valid - @param begin the buffer - @param len buffer len - @return true if buffer looks valid - */ - bool checkHash(const void* begin, int len) const; - - bool magicOk() const { - return *((unsigned*)magic) == 0x0a0a0a0a; - } -}; - -/** declares "the next entry(s) are for this database / file path prefix" */ -struct JDbContext { - JDbContext() : sentinel(JEntry::OpCode_DbContext) {} - const unsigned sentinel; // compare to JEntry::len -- zero is our sentinel - // char dbname[]; -}; - -/** "last sequence number" */ -struct LSNFile { - unsigned ver; - unsigned reserved2; - unsigned long long lsn; - unsigned long long checkbytes; - unsigned long long reserved[8]; - - void set(unsigned long long lsn); - unsigned long long get(); -}; - -#pragma pack() -} -} diff --git a/src/mongo/db/storage/mmap_v1/dur_journalimpl.h b/src/mongo/db/storage/mmap_v1/dur_journalimpl.h deleted file mode 100644 index 9a4d22fa826..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_journalimpl.h +++ /dev/null @@ -1,130 +0,0 @@ -// @file dur_journal.h - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. 
-* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <boost/filesystem/path.hpp> - -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/db/storage/mmap_v1/logfile.h" -#include "mongo/platform/atomic_word.h" -#include "mongo/stdx/mutex.h" -#include "mongo/util/concurrency/mutex.h" - -namespace mongo { - -class ClockSource; - -namespace dur { - -/** the writeahead journal for durability */ -class Journal { -public: - std::string dir; // set by journalMakeDir() during initialization - - Journal(); - - /** call during startup by journalMakeDir() */ - void init(ClockSource* cs, int64_t serverStartMs); - - /** check if time to rotate files. assure a file is open. - done separately from the journal() call as we can do this part - outside of lock. - thread: durThread() - */ - void rotate(); - - /** append to the journal file - */ - void journal(const JSectHeader& h, const AlignedBuilder& b); - - boost::filesystem::path getFilePathFor(int filenumber) const; - - void cleanup(bool log); // closes and removes journal files - - unsigned long long curFileId() const { - return _curFileId; - } - - void assureLogFileOpen() { - stdx::lock_guard<SimpleMutex> lk(_curLogFileMutex); - if (_curLogFile == 0) - _open(); - } - - /** open a journal file to journal operations to. */ - void open(); - -private: - /** check if time to rotate files. assure a file is open. 
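For orientation: journal files live in the journal/ directory and are named j._<n> (see JHeader above and getFiles() in dur_recover.cpp below). A plausible shape for getFilePathFor(), whose removed definition lived in dur_journal.cpp, is:

#include <string>

// Sketch only; maps a file number onto <dir>/j._<n>. Rotation bumps
// _nextFileNumber and opens a fresh file, recycling preallocated files
// (hence the fileId stamped into each JHeader).
boost::filesystem::path getFilePathFor(int filenumber) const {
    return boost::filesystem::path(dir) / ("j._" + std::to_string(filenumber));
}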
- * internally called with every commit - */ - void _rotate(unsigned long long lsnOfCurrentJournalEntry); - - void _open(); - void closeCurrentJournalFile(); - void removeUnneededJournalFiles(); - - unsigned long long _written = 0; // bytes written so far to the current journal (log) file - unsigned _nextFileNumber = 0; - - SimpleMutex _curLogFileMutex; - - LogFile* _curLogFile; // use _curLogFileMutex - unsigned long long _curFileId; // current file id see JHeader::fileId - - struct JFile { - std::string filename; - unsigned long long lastEventTimeMs; - }; - - // files which have been closed but not unlinked (rotated out) yet - // ordered oldest to newest - std::list<JFile> _oldJournalFiles; // use _curLogFileMutex - - // lsn related - friend void setLastSeqNumberWrittenToSharedView(uint64_t seqNumber); - friend void notifyPreDataFileFlush(); - friend void notifyPostDataFileFlush(); - void updateLSNFile(unsigned long long lsnOfCurrentJournalEntry); - // data <= this time is in the shared view - AtomicUInt64 _lastSeqNumberWrittenToSharedView; - // data <= this time was in the shared view when the last flush to start started - AtomicUInt64 _preFlushTime; - // data <= this time is fsynced in the datafiles (unless hard drive controller is caching) - AtomicUInt64 _lastFlushTime; - AtomicWord<bool> _writeToLSNNeeded; - - ClockSource* _clock; - int64_t _serverStartMs; -}; -} -} diff --git a/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp b/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp deleted file mode 100644 index d31b883b9c7..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_preplogbuffer.cpp +++ /dev/null @@ -1,209 +0,0 @@ -/** - * Copyright (C) 2009-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -/* - PREPLOGBUFFER - we will build an output buffer ourself and then use O_DIRECT - we could be in read lock for this - for very large objects write directly to redo log in situ? 
- @see https://docs.google.com/drawings/edit?id=1TklsmZzm7ohIZkwgeK6rMvsdaR13KjtJYMsfLr175Zc -*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/dur_commitjob.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/dur_journalimpl.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/stdx/thread.h" -#include "mongo/util/clock_source.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/stacktrace.h" -#include "mongo/util/timer.h" - -namespace mongo { - -using std::endl; -using std::min; -using std::stringstream; - -namespace dur { - -extern Journal j; -extern CommitJob commitJob; - -const RelativePath local = RelativePath::fromRelativePath("local"); - -static DurableMappedFile* findMMF_inlock(void* ptr, size_t& ofs) { - DurableMappedFile* f = privateViews.find_inlock(ptr, ofs); - if (f == 0) { - error() << "findMMF_inlock failed " << privateViews.numberOfViews_inlock() << endl; - - // we want a stack trace and the assert below didn't print a trace once in the real world - // - not sure why - printStackTrace(); - stringstream ss; - ss << "view pointer cannot be resolved " << std::hex << (size_t)ptr; - journalingFailure(ss.str().c_str()); // asserts, which then abends - } - return f; -} - -/** put the basic write operation into the buffer (bb) to be journaled */ -static void prepBasicWrite_inlock(AlignedBuilder& bb, - const WriteIntent* i, - RelativePath& lastDbPath) { - size_t ofs = 1; - DurableMappedFile* mmf = findMMF_inlock(i->start(), /*out*/ ofs); - - if (MONGO_unlikely(!mmf->willNeedRemap())) { - // tag this mmf as needed a remap of its private view later. - // usually it will already be dirty/already set, so we do the if above first - // to avoid possibility of cpu cache line contention - mmf->setWillNeedRemap(); - } - - // since we have already looked up the mmf, we go ahead and remember the write view location - // so we don't have to find the DurableMappedFile again later in WRITETODATAFILES() - // - // this was for WRITETODATAFILES_Impl2 so commented out now - // - /* - dassert( i->w_ptr == 0 ); - i->w_ptr = ((char*)mmf->view_write()) + ofs; - */ - - JEntry e; - e.len = min(i->length(), (unsigned)(mmf->length() - ofs)); // don't write past end of file - verify(ofs <= 0x80000000); - e.ofs = (unsigned)ofs; - e.setFileNo(mmf->fileSuffixNo()); - - if (mmf->relativePath() == local) { - e.setLocalDbContextBit(); - } else if (mmf->relativePath() != lastDbPath) { - lastDbPath = mmf->relativePath(); - JDbContext c; - bb.appendStruct(c); - bb.appendStr(lastDbPath.toString()); - } - - bb.appendStruct(e); - bb.appendBuf(i->start(), e.len); - - if (MONGO_unlikely(e.len != (unsigned)i->length())) { - log() << "journal info splitting prepBasicWrite at boundary" << endl; - - // This only happens if we write to the last byte in a file and - // the fist byte in another file that is mapped adjacently. I - // think most OSs leave at least a one page gap between - // mappings, but better to be safe. - - WriteIntent next((char*)i->start() + e.len, i->length() - e.len); - prepBasicWrite_inlock(bb, &next, lastDbPath); - } -} - -/** basic write ops / write intents. 
note there is no particular order to these : if we have - two writes to the same location during the group commit interval, it is likely - (although not assured) that it is journaled here once. -*/ -static void prepBasicWrites(AlignedBuilder& bb, const std::vector<WriteIntent>& intents) { - stdx::lock_guard<stdx::mutex> lk(privateViews._mutex()); - - // Each time write intents switch to a different database we journal a JDbContext. - // Switches will be rare as we sort by memory location first and we batch commit. - RelativePath lastDbPath; - - invariant(!intents.empty()); - - WriteIntent last; - for (std::vector<WriteIntent>::const_iterator i = intents.begin(); i != intents.end(); i++) { - if (i->start() < last.end()) { - // overlaps - last.absorb(*i); - } else { - // discontinuous - if (i != intents.begin()) { - prepBasicWrite_inlock(bb, &last, lastDbPath); - } - - last = *i; - } - } - - prepBasicWrite_inlock(bb, &last, lastDbPath); -} - -/** we will build an output buffer ourself and then use O_DIRECT - we could be in read lock for this - caller handles locking - @return partially populated sectheader and _ab set -*/ -static void _PREPLOGBUFFER(JSectHeader& h, - AlignedBuilder& bb, - ClockSource* cs, - int64_t serverStartMs) { - // Add the JSectHeader - - // Invalidate the total length, we will fill it in later. - h.setSectionLen(0xffffffff); - h.seqNumber = generateNextSeqNumber(cs, serverStartMs); - h.fileId = j.curFileId(); - - // Ops other than basic writes (DurOp's) go first - const std::vector<std::shared_ptr<DurOp>>& durOps = commitJob.ops(); - for (std::vector<std::shared_ptr<DurOp>>::const_iterator i = durOps.begin(); i != durOps.end(); - i++) { - (*i)->serialize(bb); - } - - // Write intents - const std::vector<WriteIntent>& intents = commitJob.getIntentsSorted(); - if (!intents.empty()) { - prepBasicWrites(bb, intents); - } -} - -void PREPLOGBUFFER(/*out*/ JSectHeader& outHeader, - AlignedBuilder& outBuffer, - ClockSource* cs, - int64_t serverStartMs) { - Timer t; - j.assureLogFileOpen(); // so fileId is set - _PREPLOGBUFFER(outHeader, outBuffer, cs, serverStartMs); - stats.curr()->_prepLogBufferMicros += t.micros(); -} -} -} diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.cpp b/src/mongo/db/storage/mmap_v1/dur_recover.cpp deleted file mode 100644 index 936766f0160..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_recover.cpp +++ /dev/null @@ -1,682 +0,0 @@ -// @file dur_recover.cpp crash recovery via the journal - -/** -* Copyright (C) 2009 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. 
You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kJournal - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur_recover.h" - -#include <cstring> -#include <fcntl.h> -#include <iomanip> -#include <iostream> -#include <sys/stat.h> - -#include "mongo/db/client.h" -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/compress.h" -#include "mongo/db/storage/mmap_v1/dur_commitjob.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/db/storage/mmap_v1/dur_stats.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/durop.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/platform/strnlen.h" -#include "mongo/util/bufreader.h" -#include "mongo/util/checksum.h" -#include "mongo/util/destructor_guard.h" -#include "mongo/util/exit.h" -#include "mongo/util/hex.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/scopeguard.h" -#include "mongo/util/startup_test.h" - -namespace mongo { - -using std::shared_ptr; -using std::unique_ptr; -using std::endl; -using std::hex; -using std::map; -using std::pair; -using std::setw; -using std::string; -using std::stringstream; -using std::vector; - -/** - * Thrown when a journal section is corrupt. This is considered OK as long as it occurs while - * processing the last file. Processing stops at the first corrupt section. - * - * Any logging about the nature of the corruption should happen before throwing as this class - * contains no data. - */ -class JournalSectionCorruptException {}; - -namespace dur { - -// The singleton recovery job object -RecoveryJob& RecoveryJob::_instance = *(new RecoveryJob()); - - -void removeJournalFiles(); -boost::filesystem::path getJournalDir(); - - -struct ParsedJournalEntry { /*copyable*/ - ParsedJournalEntry() : e(0) {} - - // relative path of database for the operation. - // might be a pointer into mmaped Journal file - const char* dbName; - - // those are pointers into the memory mapped journal file - const JEntry* e; // local db sentinel is already parsed out here into dbName - - // if not one of the two simple JEntry's above, this is the operation: - std::shared_ptr<DurOp> op; -}; - - -/** - * Get journal filenames, in order. Throws if unexpected content found. 
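A concrete example of the ordering contract implemented below:

// journal/ contains j._0, j._1, j._2  -> returned sorted by numeric suffix
// journal/ contains j._0, j._2        -> uasserts 13532 (missing preceding file)
// two names parse to the same suffix  -> uasserts 13531 (unexpected files)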
- */ -static void getFiles(boost::filesystem::path dir, vector<boost::filesystem::path>& files) { - map<unsigned, boost::filesystem::path> m; - for (boost::filesystem::directory_iterator i(dir); i != boost::filesystem::directory_iterator(); - ++i) { - boost::filesystem::path filepath = *i; - string fileName = boost::filesystem::path(*i).leaf().string(); - if (str::startsWith(fileName, "j._")) { - unsigned u = str::toUnsigned(str::after(fileName, '_')); - if (m.count(u)) { - uasserted(13531, - str::stream() << "unexpected files in journal directory " << dir.string() - << " : " - << fileName); - } - m.insert(pair<unsigned, boost::filesystem::path>(u, filepath)); - } - } - for (map<unsigned, boost::filesystem::path>::iterator i = m.begin(); i != m.end(); ++i) { - if (i != m.begin() && m.count(i->first - 1) == 0) { - uasserted(13532, - str::stream() << "unexpected file in journal directory " << dir.string() - << " : " - << boost::filesystem::path(i->second).leaf().string() - << " : can't find its preceding file"); - } - files.push_back(i->second); - } -} - -/** read through the memory mapped data of a journal file (journal/j._<n> file) - throws -*/ -class JournalSectionIterator { - MONGO_DISALLOW_COPYING(JournalSectionIterator); - -public: - JournalSectionIterator(const JSectHeader& h, - const void* compressed, - unsigned compressedLen, - bool doDurOpsRecovering) - : _h(h), _lastDbName(0), _doDurOps(doDurOpsRecovering) { - verify(doDurOpsRecovering); - - if (!uncompress((const char*)compressed, compressedLen, &_uncompressed)) { - // We check the checksum before we uncompress, but this may still fail as the - // checksum isn't foolproof. - log() << "couldn't uncompress journal section" << endl; - throw JournalSectionCorruptException(); - } - - const char* p = _uncompressed.c_str(); - verify(compressedLen == _h.sectionLen() - sizeof(JSectFooter) - sizeof(JSectHeader)); - - _entries = unique_ptr<BufReader>(new BufReader(p, _uncompressed.size())); - } - - // We work with the uncompressed buffer when doing a WRITETODATAFILES (for speed) - JournalSectionIterator(const JSectHeader& h, const void* p, unsigned len) - : _entries(new BufReader((const char*)p, len)), _h(h), _lastDbName(0), _doDurOps(false) {} - - bool atEof() const { - return _entries->atEof(); - } - - unsigned long long seqNumber() const { - return _h.seqNumber; - } - - /** get the next entry from the log. this function parses and combines JDbContext and JEntry's. - * throws on premature end of section. 
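For reference, the on-disk section layout that next() walks, per dur_journalformat.h above:

// [JSectHeader | payload ................................. | JSectFooter]
//                 payload = (DurOp | JDbContext | JEntry)*
// Header and footer are stored uncompressed; when recovering, the payload is
// compressed on disk and has already been uncompressed into _uncompressed by
// the constructor. A JDbContext merely switches the current database; the
// basic write (JEntry) that follows it is the actual operation.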
- */ - void next(ParsedJournalEntry& e) { - unsigned lenOrOpCode{}; - _entries->read(lenOrOpCode); - - if (lenOrOpCode > JEntry::OpCode_Min) { - switch (lenOrOpCode) { - case JEntry::OpCode_Footer: { - verify(false); - } - - case JEntry::OpCode_FileCreated: - case JEntry::OpCode_DropDb: { - e.dbName = 0; - std::shared_ptr<DurOp> op = DurOp::read(lenOrOpCode, *_entries); - if (_doDurOps) { - e.op = op; - } - return; - } - - case JEntry::OpCode_DbContext: { - _lastDbName = (const char*)_entries->pos(); - const unsigned limit = _entries->remaining(); - const unsigned len = strnlen(_lastDbName, limit); - if (_lastDbName[len] != '\0') { - log() << "problem processing journal file during recovery"; - throw JournalSectionCorruptException(); - } - - _entries->skip(len + 1); // skip '\0' too - _entries->read(lenOrOpCode); // read this for the fall through - } - // fall through as a basic operation always follows jdbcontext, and we don't have - // anything to return yet - - default: - // fall through - ; - } - } - - // JEntry - a basic write - verify(lenOrOpCode && lenOrOpCode < JEntry::OpCode_Min); - _entries->rewind(4); - e.e = (JEntry*)_entries->skip(sizeof(JEntry)); - e.dbName = e.e->isLocalDbContext() ? "local" : _lastDbName; - verify(e.e->len == lenOrOpCode); - _entries->skip(e.e->len); - } - - -private: - unique_ptr<BufReader> _entries; - const JSectHeader _h; - const char* _lastDbName; // pointer into mmaped journal file - const bool _doDurOps; - string _uncompressed; -}; - - -static string fileName(const char* dbName, int fileNo) { - stringstream ss; - ss << dbName << '.'; - verify(fileNo >= 0); - if (fileNo == JEntry::DotNsSuffix) - ss << "ns"; - else - ss << fileNo; - - // relative name -> full path name - boost::filesystem::path full(storageGlobalParams.dbpath); - full /= ss.str(); - return full.string(); -} - - -RecoveryJob::RecoveryJob() - : _recovering(false), - _lastDataSyncedFromLastRun(0), - _lastSeqSkipped(0), - _appliedAnySections(false) {} - -#pragma warning(push) -// C4722: 'mongo::dur::RecoveryJob::~RecoveryJob': destructor never returns, potential memory leak -#pragma warning(disable : 4722) -RecoveryJob::~RecoveryJob() { - invariant(!"RecoveryJob is intentionally leaked with a bare call to operator new()"); -} -#pragma warning(pop) - -void RecoveryJob::close(OperationContext* opCtx) { - stdx::lock_guard<stdx::mutex> lk(_mx); - _close(opCtx); -} - -void RecoveryJob::_close(OperationContext* opCtx) { - MongoFile::flushAll(opCtx, true); - LockMongoFilesExclusive lock(opCtx); - for (auto& durFile : _mmfs) { - durFile->close(opCtx); - } - _mmfs.clear(); -} - -RecoveryJob::Last::Last(OperationContext* opCtx) : _opCtx(opCtx), mmf(NULL), fileNo(-1) { - // Make sure the files list does not change from underneath - LockMongoFilesShared::assertAtLeastReadLocked(opCtx); -} - -DurableMappedFile* RecoveryJob::Last::newEntry(const dur::ParsedJournalEntry& entry, - RecoveryJob& rj) { - int num = entry.e->getFileNo(); - if (num == fileNo && entry.dbName == dbName) - return mmf; - - string fn = fileName(entry.dbName, num); - MongoFile* file; - { - MongoFileFinder finder(_opCtx); // must release lock before creating new DurableMappedFile - file = finder.findByPath(fn); - } - - if (file) { - verify(file->isDurableMappedFile()); - mmf = (DurableMappedFile*)file; - } else { - if (!rj._recovering) { - log() << "journal error applying writes, file " << fn << " is not open" << endl; - verify(false); - } - std::shared_ptr<DurableMappedFile> sp(new DurableMappedFile(_opCtx)); - verify(sp->open(_opCtx, 
fn)); - rj._mmfs.push_back(sp); - mmf = sp.get(); - } - - // we do this last so that if an exception were thrown, there isn't any wrong memory - dbName = entry.dbName; - fileNo = num; - return mmf; -} - -void RecoveryJob::write(Last& last, const ParsedJournalEntry& entry) { - // TODO(mathias): look into making some of these dasserts - verify(entry.e); - verify(entry.dbName); - - DurableMappedFile* mmf = last.newEntry(entry, *this); - - if ((entry.e->ofs + entry.e->len) <= mmf->length()) { - verify(mmf->view_write()); - verify(entry.e->srcData()); - - void* dest = (char*)mmf->view_write() + entry.e->ofs; - memcpy(dest, entry.e->srcData(), entry.e->len); - stats.curr()->_writeToDataFilesBytes += entry.e->len; - } else { - massert(13622, "Trying to write past end of file in WRITETODATAFILES", _recovering); - } -} - -void RecoveryJob::applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump) { - if (entry.e) { - if (dump) { - stringstream ss; - ss << " BASICWRITE " << setw(20) << entry.dbName << '.'; - if (entry.e->isNsSuffix()) - ss << "ns"; - else - ss << setw(2) << entry.e->getFileNo(); - ss << ' ' << setw(6) << entry.e->len << ' ' - << /*hex << setw(8) << (size_t) fqe.srcData << dec <<*/ - " " << redact(hexdump(entry.e->srcData(), entry.e->len)); - log() << ss.str() << endl; - } - if (apply) { - write(last, entry); - } - } else if (entry.op) { - // a DurOp subclass operation - if (dump) { - log() << " OP " << redact(entry.op->toString()) << endl; - } - if (apply) { - if (entry.op->needFilesClosed()) { - _close(last.opCtx()); // locked in processSection - } - entry.op->replay(); - } - } -} - -void RecoveryJob::applyEntries(OperationContext* opCtx, const vector<ParsedJournalEntry>& entries) { - const bool apply = (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) == 0; - const bool dump = (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal); - - if (dump) { - log() << "BEGIN section" << endl; - } - - Last last(opCtx); - for (vector<ParsedJournalEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) { - applyEntry(last, *i, apply, dump); - } - - if (dump) { - log() << "END section" << endl; - } -} - -void RecoveryJob::processSection(OperationContext* opCtx, - const JSectHeader* h, - const void* p, - unsigned len, - const JSectFooter* f) { - LockMongoFilesShared lkFiles(opCtx); // for RecoveryJob::Last - stdx::lock_guard<stdx::mutex> lk(_mx); - - if (_recovering) { - // Check the footer checksum before doing anything else. - verify(((const char*)h) + sizeof(JSectHeader) == p); - if (!f->checkHash(h, len + sizeof(JSectHeader))) { - log() << "journal section checksum doesn't match"; - throw JournalSectionCorruptException(); - } - - static uint64_t numJournalSegmentsSkipped = 0; - static const uint64_t kMaxSkippedSectionsToLog = 10; - if (_lastDataSyncedFromLastRun > h->seqNumber + ExtraKeepTimeMs) { - if (_appliedAnySections) { - severe() << "Journal section sequence number " << h->seqNumber - << " is lower than the threshold for applying (" - << h->seqNumber + ExtraKeepTimeMs - << ") but we have already applied some journal sections. 
This implies a " - << "corrupt journal file."; - fassertFailed(34369); - } - - if (++numJournalSegmentsSkipped < kMaxSkippedSectionsToLog) { - log() << "recover skipping application of section seq:" << h->seqNumber - << " < lsn:" << _lastDataSyncedFromLastRun << endl; - } else if (numJournalSegmentsSkipped == kMaxSkippedSectionsToLog) { - log() << "recover skipping application of section more..." << endl; - } - _lastSeqSkipped = h->seqNumber; - return; - } - - if (!_appliedAnySections) { - _appliedAnySections = true; - if (numJournalSegmentsSkipped >= kMaxSkippedSectionsToLog) { - // Log the last skipped section's sequence number if it hasn't been logged before. - log() << "recover final skipped journal section had sequence number " - << _lastSeqSkipped; - } - log() << "recover applying initial journal section with sequence number " - << h->seqNumber; - } - } - - unique_ptr<JournalSectionIterator> i; - if (_recovering) { - i = unique_ptr<JournalSectionIterator>(new JournalSectionIterator(*h, p, len, _recovering)); - } else { - i = unique_ptr<JournalSectionIterator>( - new JournalSectionIterator(*h, /*after header*/ p, /*w/out header*/ len)); - } - - // we use a static so that we don't have to reallocate every time through. occasionally we - // go back to a small allocation so that if there were a spiky growth it won't stick forever. - static vector<ParsedJournalEntry> entries; - entries.clear(); - /** TEMP uncomment - RARELY OCCASIONALLY { - if( entries.capacity() > 2048 ) { - entries.shrink_to_fit(); - entries.reserve(2048); - } - } - */ - - // first read all entries to make sure this section is valid - ParsedJournalEntry e; - while (!i->atEof()) { - i->next(e); - entries.push_back(e); - } - - // got all the entries for one group commit. apply them: - applyEntries(opCtx, entries); -} - -/** apply a specific journal file, that is already mmap'd - @param p start of the memory mapped file - @return true if this is detected to be the last file (ends abruptly) -*/ -bool RecoveryJob::processFileBuffer(OperationContext* opCtx, const void* p, unsigned len) { - try { - unsigned long long fileId; - BufReader br(p, len); - - { - // read file header - JHeader h; - std::memset(&h, 0, sizeof(h)); - - br.read(h); - - if (!h.valid()) { - log() << "Journal file header invalid. This could indicate corruption, or " - << "an unclean shutdown while writing the first section in a journal " - << "file."; - throw JournalSectionCorruptException(); - } - - if (!h.versionOk()) { - log() << "journal file version number mismatch got:" << hex << h._version - << " expected:" << hex << (unsigned)JHeader::CurrentVersion - << ". if you have just upgraded, recover with old version of mongod, " - "terminate cleanly, then upgrade." - << endl; - // Not using JournalSectionCurruptException as we don't want to ignore - // journal files on upgrade. 
- uasserted(13536, str::stream() << "journal version number mismatch " << h._version); - } - fileId = h.fileId; - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal) { - log() << "JHeader::fileId=" << fileId << endl; - } - } - - // read sections - while (!br.atEof()) { - JSectHeader h; - std::memset(&h, 0, sizeof(h)); - - br.peek(h); - if (h.fileId != fileId) { - if (kDebugBuild || - (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal)) { - log() << "Ending processFileBuffer at differing fileId want:" << fileId - << " got:" << h.fileId << endl; - log() << " sect len:" << h.sectionLen() << " seqnum:" << h.seqNumber << endl; - } - return true; - } - unsigned slen = h.sectionLen(); - unsigned dataLen = slen - sizeof(JSectHeader) - sizeof(JSectFooter); - const char* hdr = (const char*)br.skip(h.sectionLenWithPadding()); - const char* data = hdr + sizeof(JSectHeader); - const char* footer = data + dataLen; - processSection( - opCtx, (const JSectHeader*)hdr, data, dataLen, (const JSectFooter*)footer); - - // ctrl c check - uassert(ErrorCodes::Interrupted, - "interrupted during journal recovery", - !globalInShutdownDeprecated()); - } - } catch (const DBException& ex) { - if (ex.code() != ErrorCodes::Overflow) - throw; // Only ignore errors related to the file abruptly ending. - - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal) - log() << "ABRUPT END" << endl; - return true; // abrupt end - } catch (const JournalSectionCorruptException&) { - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalDumpJournal) - log() << "ABRUPT END" << endl; - return true; // abrupt end - } - - return false; // non-abrupt end -} - -/** apply a specific journal file */ -bool RecoveryJob::processFile(OperationContext* opCtx, boost::filesystem::path journalfile) { - log() << "recover " << journalfile.string() << endl; - - try { - if (boost::filesystem::file_size(journalfile.string()) == 0) { - log() << "recover info " << journalfile.string() << " has zero length" << endl; - return true; - } - } catch (...) 
{ - // if something weird like a permissions problem keep going so the massert down below can - // happen (presumably) - log() << "recover exception checking filesize" << endl; - } - - MemoryMappedFile f{opCtx, MongoFile::Options::READONLY | MongoFile::Options::SEQUENTIAL}; - ON_BLOCK_EXIT([&f, &opCtx] { - LockMongoFilesExclusive lock(opCtx); - f.close(opCtx); - }); - void* p = f.map(opCtx, journalfile.string().c_str()); - massert(13544, str::stream() << "recover error couldn't open " << journalfile.string(), p); - return processFileBuffer(opCtx, p, (unsigned)f.length()); -} - -/** @param files all the j._0 style files we need to apply for recovery */ -void RecoveryJob::go(OperationContext* opCtx, vector<boost::filesystem::path>& files) { - log() << "recover begin" << endl; - LockMongoFilesExclusive lkFiles(opCtx); // for RecoveryJob::Last - _recovering = true; - - // load the last sequence number synced to the datafiles on disk before the last crash - _lastDataSyncedFromLastRun = journalReadLSN(); - log() << "recover lsn: " << _lastDataSyncedFromLastRun << endl; - - for (unsigned i = 0; i != files.size(); ++i) { - bool abruptEnd = processFile(opCtx, files[i]); - if (abruptEnd && i + 1 < files.size()) { - log() << "recover error: abrupt end to file " << files[i].string() - << ", yet it isn't the last journal file" << endl; - close(opCtx); - uasserted(13535, "recover abrupt journal file end"); - } - } - - if (_lastSeqSkipped && !_appliedAnySections) { - log() << "recover journal replay completed without applying any sections. " - << "This can happen if there were no writes after the last fsync of the data files. " - << "Last skipped sections had sequence number " << _lastSeqSkipped; - } - - close(opCtx); - - if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalScanOnly) { - uasserted(13545, - str::stream() << "--durOptions " << (int)MMAPV1Options::JournalScanOnly - << " (scan only) specified"); - } - - log() << "recover cleaning up" << endl; - removeJournalFiles(); - log() << "recover done" << endl; - okToCleanUp = true; - _recovering = false; -} - -void _recover(OperationContext* opCtx) { - verify(storageGlobalParams.dur); - - boost::filesystem::path p = getJournalDir(); - if (!exists(p)) { - log() << "directory " << p.string() - << " does not exist, there will be no recovery startup step" << endl; - okToCleanUp = true; - return; - } - - vector<boost::filesystem::path> journalFiles; - getFiles(p, journalFiles); - - if (journalFiles.empty()) { - log() << "recover : no journal files present, no recovery needed" << endl; - okToCleanUp = true; - return; - } - - RecoveryJob::get().go(opCtx, journalFiles); -} - -/** recover from a crash - called during startup - throws on error -*/ -void replayJournalFilesAtStartup() { - // we use a lock so that exitCleanly will wait for us - // to finish (or at least to notice what is up and stop) - auto opCtx = cc().makeOperationContext(); - Lock::GlobalWrite lk(opCtx.get()); - - _recover(opCtx.get()); // throws on interruption -} - -struct BufReaderY { - int a, b; -}; -class BufReaderUnitTest : public StartupTest { - -public: - void run() { - BufReader r((void*)"abcdabcdabcd", 12); - char x; - BufReaderY y; - r.read(x); // cout << x; // a - verify(x == 'a'); - r.read(y); - r.read(x); - verify(x == 'b'); - } -} brunittest; - -} // namespace dur -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_recover.h b/src/mongo/db/storage/mmap_v1/dur_recover.h deleted file mode 100644 index 79ce0b03e5d..00000000000 --- 
a/src/mongo/db/storage/mmap_v1/dur_recover.h +++ /dev/null @@ -1,119 +0,0 @@ -// @file dur.h durability support - -/** -* Copyright (C) 2009 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <boost/filesystem/operations.hpp> -#include <list> -#include <memory> - -#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/stdx/mutex.h" -#include "mongo/util/concurrency/mutex.h" - -namespace mongo { - -class DurableMappedFile; - -namespace dur { - -struct ParsedJournalEntry; - -/** call go() to execute a recovery from existing journal files. - */ -class RecoveryJob { - MONGO_DISALLOW_COPYING(RecoveryJob); - -public: - RecoveryJob(); - ~RecoveryJob(); - - void go(OperationContext* opCtx, std::vector<boost::filesystem::path>& files); - - /** @param data data between header and footer. compressed if recovering. 
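The startup recovery call chain, as reconstructed from dur_recover.cpp above:

// replayJournalFilesAtStartup()
//   -> _recover()                 // locates the journal/ directory
//     -> RecoveryJob::go()        // loads the last synced LSN, walks files in order
//       -> processFile()          // memory-maps one j._<n> file
//         -> processFileBuffer()  // validates JHeader, iterates the sections
//           -> processSection()   // checksum, uncompress, parse all entries
//             -> applyEntries()   // replays the writes into the datafiles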
*/ - void processSection(OperationContext* opCtx, - const JSectHeader* h, - const void* data, - unsigned len, - const JSectFooter* f); - - // locks and calls _close() - void close(OperationContext* opCtx); - - static RecoveryJob& get() { - return _instance; - } - -private: - class Last { - public: - Last(OperationContext* opCtx); - - DurableMappedFile* newEntry(const ParsedJournalEntry&, RecoveryJob&); - - OperationContext* opCtx() { - return _opCtx; - } - - private: - OperationContext* _opCtx; - DurableMappedFile* mmf; - std::string dbName; - int fileNo; - }; - - - void write(Last& last, const ParsedJournalEntry& entry); // actually writes to the file - void applyEntry(Last& last, const ParsedJournalEntry& entry, bool apply, bool dump); - void applyEntries(OperationContext* opCtx, const std::vector<ParsedJournalEntry>& entries); - bool processFileBuffer(OperationContext* opCtx, const void*, unsigned len); - bool processFile(OperationContext* opCtx, boost::filesystem::path journalfile); - void _close(OperationContext* opCtx); // doesn't lock - - // Set of memory mapped files and a mutex to protect them - stdx::mutex _mx; - std::list<std::shared_ptr<DurableMappedFile>> _mmfs; - - // Are we in recovery or WRITETODATAFILES - bool _recovering; - - unsigned long long _lastDataSyncedFromLastRun; - unsigned long long _lastSeqSkipped; - bool _appliedAnySections; - - - static RecoveryJob& _instance; -}; - - -void replayJournalFilesAtStartup(); -} -} diff --git a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp deleted file mode 100644 index ff5b114975f..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.cpp +++ /dev/null @@ -1,316 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/dur_recovery_unit.h" - -#include <algorithm> -#include <limits> -#include <map> -#include <set> -#include <string> - -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/util/assert_util.h" -#include "mongo/util/log.h" - -namespace mongo { - -DurRecoveryUnit::DurRecoveryUnit() - : _writeCount(0), _writeBytes(0), _inUnitOfWork(false), _rollbackWritesDisabled(false) {} - -void DurRecoveryUnit::beginUnitOfWork(OperationContext* opCtx) { - invariant(!_inUnitOfWork); - _inUnitOfWork = true; -} - -void DurRecoveryUnit::commitUnitOfWork() { - invariant(_inUnitOfWork); - - commitChanges(); - - // global journal flush opportunity - getDur().commitIfNeeded(); - - resetChanges(); -} - -void DurRecoveryUnit::abortUnitOfWork() { - invariant(_inUnitOfWork); - - rollbackChanges(); - resetChanges(); -} - -void DurRecoveryUnit::abandonSnapshot() { - invariant(!_inUnitOfWork); - // no-op since we have no transaction -} - -void DurRecoveryUnit::commitChanges() { - if (getDur().isDurable()) - markWritesForJournaling(); - - try { - for (Changes::const_iterator it = _changes.begin(), end = _changes.end(); it != end; ++it) { - (*it)->commit(boost::none); - } - } catch (...) { - std::terminate(); - } -} - -void DurRecoveryUnit::markWritesForJournaling() { - if (!_writeCount) - return; - - typedef std::pair<void*, unsigned> Intent; - std::vector<Intent> intents; - const size_t numStoredWrites = _initialWrites.size() + _mergedWrites.size(); - intents.reserve(numStoredWrites); - - // Show very large units of work at LOG(1) level as they may hint at performance issues - const int logLevel = (_writeCount > 100 * 1000 || _writeBytes > 50 * 1024 * 1024) ? 1 : 3; - - LOG(logLevel) << _writeCount << " writes (" << _writeBytes / 1024 << " kB) covered by " - << numStoredWrites << " pre-images (" << _preimageBuffer.size() / 1024 << " kB) "; - - // orders the initial, unmerged writes, by address so we can coalesce overlapping and - // adjacent writes - std::sort(_initialWrites.begin(), _initialWrites.end()); - - if (!_initialWrites.empty()) { - intents.push_back(std::make_pair(_initialWrites.front().addr, _initialWrites.front().len)); - for (InitialWrites::iterator it = (_initialWrites.begin() + 1), end = _initialWrites.end(); - it != end; - ++it) { - Intent& lastIntent = intents.back(); - char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second; - if (it->addr <= lastEnd) { - // overlapping or adjacent, so extend. - ptrdiff_t extendedLen = (it->end()) - static_cast<char*>(lastIntent.first); - lastIntent.second = std::max(lastIntent.second, unsigned(extendedLen)); - } else { - // not overlapping, so create a new intent - intents.push_back(std::make_pair(it->addr, it->len)); - } - } - } - - MergedWrites::iterator it = _mergedWrites.begin(); - if (it != _mergedWrites.end()) { - intents.push_back(std::make_pair(it->addr, it->len)); - while (++it != _mergedWrites.end()) { - // Check the property that write intents are sorted and don't overlap. - invariant(it->addr >= intents.back().first); - Intent& lastIntent = intents.back(); - char* lastEnd = static_cast<char*>(lastIntent.first) + lastIntent.second; - if (it->addr == lastEnd) { - // adjacent, so extend. 
- lastIntent.second += it->len; - } else { - // not overlapping, so create a new intent - invariant(it->addr > lastEnd); - intents.push_back(std::make_pair(it->addr, it->len)); - } - } - } - LOG(logLevel) << _mergedWrites.size() << " pre-images " - << "coalesced into " << intents.size() << " write intents"; - - getDur().declareWriteIntents(intents); -} - -void DurRecoveryUnit::resetChanges() { - _writeCount = 0; - _writeBytes = 0; - _initialWrites.clear(); - _mergedWrites.clear(); - _changes.clear(); - _preimageBuffer.clear(); - _rollbackWritesDisabled = false; - _inUnitOfWork = false; -} - -void DurRecoveryUnit::rollbackChanges() { - // First rollback disk writes, then Changes. This matches behavior in other storage engines - // that either rollback a transaction or don't write a writebatch. - - if (_rollbackWritesDisabled) { - LOG(2) << " ***** NOT ROLLING BACK " << _writeCount << " disk writes"; - } else { - LOG(2) << " ***** ROLLING BACK " << _writeCount << " disk writes"; - - // First roll back the merged writes. These have no overlap or ordering requirement - // other than needing to be rolled back before all _initialWrites. - for (MergedWrites::iterator it = _mergedWrites.begin(); it != _mergedWrites.end(); ++it) { - _preimageBuffer.copy(it->addr, it->len, it->offset); - } - - // Then roll back the initial writes in LIFO order, as these might have overlaps. - for (InitialWrites::reverse_iterator rit = _initialWrites.rbegin(); - rit != _initialWrites.rend(); - ++rit) { - _preimageBuffer.copy(rit->addr, rit->len, rit->offset); - } - } - - LOG(2) << " ***** ROLLING BACK " << (_changes.size()) << " custom changes"; - - try { - for (int i = _changes.size() - 1; i >= 0; i--) { - LOG(2) << "CUSTOM ROLLBACK " << redact(demangleName(typeid(*_changes[i]))); - _changes[i]->rollback(); - } - } catch (...) { - std::terminate(); - } -} - -bool DurRecoveryUnit::waitUntilDurable() { - invariant(!_inUnitOfWork); - return getDur().waitUntilDurable(); -} - -void DurRecoveryUnit::mergingWritingPtr(char* addr, size_t len) { - // The invariant is that all writes are non-overlapping and non-empty. So, a single - // writingPtr call may result in a number of new segments added. At this point, we cannot - // in general merge adjacent writes, as that would require inefficient operations on the - // preimage buffer. - - MergedWrites::iterator coveringWrite = _mergedWrites.upper_bound(Write(addr, 0, 0)); - - char* const end = addr + len; - while (addr < end) { - dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr); - - // Determine whether addr[0] is already covered by a write or not. - // If covered, adjust addr and len to exclude the covered run from addr[0] onwards. - - if (coveringWrite != _mergedWrites.end()) { - char* const cwEnd = coveringWrite->end(); - - if (coveringWrite->addr <= addr) { - // If the begin of the covering write at or before addr[0], addr[0] is covered. - // While the existing pre-image will not generally be the same as the data - // being written now, during rollback only the oldest pre-image matters. - - if (end <= cwEnd) { - break; // fully covered - } - - addr = cwEnd; - coveringWrite++; - dassert(coveringWrite == _mergedWrites.end() || coveringWrite->addr >= cwEnd); - } - } - dassert(coveringWrite == _mergedWrites.end() || coveringWrite->end() > addr); - - // If the next coveringWrite overlaps, adjust the end of the uncovered region. 
- char* uncoveredEnd = end; - if (coveringWrite != _mergedWrites.end() && coveringWrite->addr < end) { - uncoveredEnd = coveringWrite->addr; - } - - const size_t uncoveredLen = uncoveredEnd - addr; - if (uncoveredLen) { - // We are writing to a region that hasn't been declared previously. - _mergedWrites.insert(Write(addr, uncoveredLen, _preimageBuffer.size())); - - // Windows requires us to adjust the address space *before* we write to anything. - privateViews.makeWritable(addr, uncoveredLen); - - if (!_rollbackWritesDisabled) { - _preimageBuffer.append(addr, uncoveredLen); - } - addr = uncoveredEnd; - } - } -} - -void* DurRecoveryUnit::writingPtr(void* addr, size_t len) { - invariant(_inUnitOfWork); - - if (len == 0) { - return addr; // Don't need to do anything for empty ranges. - } - - invariant(len < size_t(std::numeric_limits<int>::max())); - - _writeCount++; - _writeBytes += len; - char* const data = static_cast<char*>(addr); - - // The initial writes are stored in a faster, but less memory-efficient way. This will - // typically be enough for simple operations, where the extra cost of incremental - // coalescing and merging would be too much. For larger writes, more redundancy is - // is expected, so the cost of checking for duplicates is offset by savings in copying - // and allocating preimage buffers. Total memory use of the preimage buffer may be up to - // kMaxUnmergedPreimageBytes larger than the amount memory covered by the write intents. - - const size_t kMaxUnmergedPreimageBytes = kDebugBuild ? 16 * 1024 : 10 * 1024 * 1024; - - if (_preimageBuffer.size() + len > kMaxUnmergedPreimageBytes) { - mergingWritingPtr(data, len); - - // After a merged write, no more initial writes can occur or there would be an - // ordering violation during rollback. So, ensure that the if-condition will be true - // for any future write regardless of length. This is true now because - // mergingWritingPtr also will store its first write in _preimageBuffer as well. - invariant(_preimageBuffer.size() >= kMaxUnmergedPreimageBytes); - - return addr; - } - - // Windows requires us to adjust the address space *before* we write to anything. - privateViews.makeWritable(data, len); - - _initialWrites.push_back(Write(data, len, _preimageBuffer.size())); - - if (!_rollbackWritesDisabled) { - _preimageBuffer.append(data, len); - } - - return addr; -} - -void DurRecoveryUnit::setRollbackWritesDisabled() { - invariant(_inUnitOfWork); - _rollbackWritesDisabled = true; -} - -void DurRecoveryUnit::registerChange(Change* change) { - invariant(_inUnitOfWork); - _changes.push_back(change); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h b/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h deleted file mode 100644 index b2c6dc0f20c..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_recovery_unit.h +++ /dev/null @@ -1,171 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include <set> -#include <string> -#include <utility> -#include <vector> - -#include "mongo/base/owned_pointer_vector.h" -#include "mongo/db/storage/recovery_unit.h" -#include "mongo/platform/compiler.h" - -#pragma once - -namespace mongo { - -/** - * Just pass through to getDur(). - */ -class DurRecoveryUnit : public RecoveryUnit { -public: - DurRecoveryUnit(); - - void beginUnitOfWork(OperationContext* opCtx) final; - void commitUnitOfWork() final; - void abortUnitOfWork() final; - - virtual bool waitUntilDurable(); - - virtual void abandonSnapshot(); - - // The recovery unit takes ownership of change. - virtual void registerChange(Change* change); - - virtual void* writingPtr(void* addr, size_t len); - - virtual void setRollbackWritesDisabled(); - - virtual SnapshotId getSnapshotId() const { - return SnapshotId(); - } - - virtual void setOrderedCommit(bool orderedCommit) {} - -private: - /** - * Marks writes for journaling, if enabled, and then commits all other Changes in order. - * Returns with empty _initialWrites, _mergedWrites, _changes and _preimageBuffer, but - * does not reset the _rollbackWritesDisabled or _mustRollback flags. This leaves the - * RecoveryUnit ready for more changes that may be committed or rolled back. - */ - void commitChanges(); - - /** - * Creates a list of write intents to be journaled, and hands it of to the active - * DurabilityInterface. - */ - void markWritesForJournaling(); - - /** - * Restores state by rolling back all writes using the saved pre-images, and then - * rolling back all other Changes in LIFO order. Resets internal state. - */ - void rollbackChanges(); - - - /** - * Version of writingPtr that checks existing writes for overlap and only stores those - * changes not yet covered by an existing write intent and pre-image. - */ - void mergingWritingPtr(char* data, size_t len); - - /** - * Reset to a clean state without any uncommitted changes or write. - */ - void resetChanges(); - - // Changes are ordered from oldest to newest. - typedef OwnedPointerVector<Change> Changes; - Changes _changes; - - - // Number of pending uncommitted writes. Incremented even if new write is fully covered by - // existing writes. - size_t _writeCount; - // Total size of the pending uncommitted writes. - size_t _writeBytes; - - /** - * These are memory writes inside the mmapv1 mmap-ed files. A pointer past the end is just - * instead of a pointer to the beginning for the benefit of MergedWrites. 
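As a companion to markWritesForJournaling() in the .cpp above, a minimal sketch of coalescing address-sorted writes into disjoint intents (hypothetical free function over the Write struct defined just below):

#include <algorithm>
#include <utility>
#include <vector>

// Assumes 'writes' is sorted by start address; merges overlapping or adjacent
// regions, mirroring the pass over _initialWrites in the removed code.
std::vector<std::pair<void*, unsigned>> coalesce(const std::vector<Write>& writes) {
    std::vector<std::pair<void*, unsigned>> intents;
    for (const Write& w : writes) {
        if (!intents.empty()) {
            char* lastEnd = static_cast<char*>(intents.back().first) + intents.back().second;
            if (w.addr <= lastEnd) {  // overlapping or adjacent: extend the last intent
                ptrdiff_t extended = w.end() - static_cast<char*>(intents.back().first);
                intents.back().second = std::max(intents.back().second, unsigned(extended));
                continue;
            }
        }
        intents.push_back(std::make_pair(static_cast<void*>(w.addr), unsigned(w.len)));
    }
    return intents;
}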
- */ - struct Write { - Write(char* addr, int len, int offset) : addr(addr), len(len), offset(offset) {} - Write(const Write& rhs) : addr(rhs.addr), len(rhs.len), offset(rhs.offset) {} - Write() : addr(0), len(0), offset(0) {} - bool operator<(const Write& rhs) const { - return addr < rhs.addr; - } - - struct compareEnd { - bool operator()(const Write& lhs, const Write& rhs) const { - return lhs.addr + lhs.len < rhs.addr + rhs.len; - } - }; - - char* end() const { - return addr + len; - } - - char* addr; - int len; - int offset; // index into _preimageBuffer - }; - - /** - * Writes are ordered by ending address, so MergedWrites::upper_bound() can find the first - * overlapping write, if any. Overlapping and duplicate regions are forbidden, as rollback - * of MergedChanges undoes changes by address rather than LIFO order. In addition, empty - * regions are not allowed. Storing writes by age does not work well for large indexed - * arrays, as coalescing is needed to bound the size of the preimage buffer. - */ - typedef std::set<Write, Write::compareEnd> MergedWrites; - MergedWrites _mergedWrites; - - // Generally it's more efficient to just store pre-images unconditionally and then - // sort/eliminate duplicates at commit time. However, this can lead to excessive memory - // use in cases involving large indexes arrays, where the same memory is written many - // times. To keep the speed for the general case and bound memory use, the first few MB of - // pre-images are stored unconditionally, but once the threshold has been exceeded, the - // remainder is stored in a more space-efficient datastructure. - typedef std::vector<Write> InitialWrites; - InitialWrites _initialWrites; - - std::string _preimageBuffer; - - bool _inUnitOfWork; - - - // Default is false. - // If true, no preimages are tracked. If rollback is subsequently attempted, the process - // will abort. - bool _rollbackWritesDisabled; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/dur_stats.h b/src/mongo/db/storage/mmap_v1/dur_stats.h deleted file mode 100644 index 0b3daf7f021..00000000000 --- a/src/mongo/db/storage/mmap_v1/dur_stats.h +++ /dev/null @@ -1,96 +0,0 @@ -// @file dur_stats.h - -/** -* Copyright (C) 2012 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. 
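The MergedWrites comment above states the key invariant: because the set is ordered by ending address, a single upper_bound() call finds the only element that could overlap a new region. A sketch of that lookup under the same assumptions (disjoint, non-empty regions):

    #include <set>

    struct Write {
        char* addr;
        int len;
        char* end() const { return addr + len; }

        struct CompareEnd {
            bool operator()(const Write& a, const Write& b) const {
                return a.end() < b.end();
            }
        };
    };
    using MergedWrites = std::set<Write, Write::CompareEnd>;

    // The first write ending strictly past 'addr' is the only one that can
    // overlap [addr, addr + len): every earlier write ends at or before addr.
    MergedWrites::const_iterator firstPossibleOverlap(const MergedWrites& w, char* addr) {
        return w.upper_bound(Write{addr, 0});
    }

Ordering by end address rather than start address is what lets a zero-length probe at 'addr' land exactly on the first candidate overlap.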
If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/db/jsobj.h" - -namespace mongo { -namespace dur { - -/** - * journaling stats. the model here is that the commit thread is the only writer, and that reads - * are uncommon (from a serverStatus command and such). Thus, there should not be multicore chatter - * overhead. - */ -struct Stats { - struct S { - std::string _CSVHeader() const; - std::string _asCSV() const; - - void _asObj(BSONObjBuilder* builder) const; - - void reset(); - - uint64_t getCurrentDurationMillis() const { - return ((curTimeMicros64() - _startTimeMicros) / 1000); - } - - - // Not reported. Internal use only. - uint64_t _startTimeMicros; - - // Reported statistics - unsigned _durationMillis; - - unsigned _commits; - unsigned _commitsInWriteLock; - - uint64_t _journaledBytes; - uint64_t _uncompressedBytes; - uint64_t _writeToDataFilesBytes; - - uint64_t _prepLogBufferMicros; - uint64_t _writeToJournalMicros; - uint64_t _writeToDataFilesMicros; - uint64_t _remapPrivateViewMicros; - uint64_t _commitsMicros; - uint64_t _commitsInWriteLockMicros; - }; - - - Stats(); - void reset(); - - BSONObj asObj() const; - - const S* curr() const { - return &_stats[_currIdx]; - } - S* curr() { - return &_stats[_currIdx]; - } - -private: - S _stats[5]; - unsigned _currIdx; -}; - -extern Stats stats; -} -} diff --git a/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp b/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp deleted file mode 100644 index fd199817f11..00000000000 --- a/src/mongo/db/storage/mmap_v1/durable_mapped_file.cpp +++ /dev/null @@ -1,315 +0,0 @@ -// durable_mapped_file.cpp - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -/** - * this module adds some of our layers atop memory mapped files - specifically our handling of - * private views & such if you don't care about journaling/durability (temp sort files & such) use - * MemoryMappedFile class, not this. 
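The dur::Stats structure above relies on a single-writer ring: the commit thread owns the current slot and readers such as serverStatus look only at completed slots, so no lock is taken on the hot path. A sketch of the rotation, with hypothetical field names and the same benign-race tolerance the original comment describes:

    #include <cstdint>

    struct StatsRing {
        struct Sample {
            unsigned commits = 0;
            std::uint64_t journaledBytes = 0;
        };

        Sample& curr() { return _slots[_idx]; }  // commit thread only
        const Sample& last() const { return _slots[(_idx + kSlots - 1) % kSlots]; }

        // Called by the lone writer at interval boundaries; after this, the
        // previous slot is immutable and safe for occasional readers.
        void rotate() {
            _idx = (_idx + 1) % kSlots;
            _slots[_idx] = Sample{};  // reset the slot being reused
        }

        static constexpr unsigned kSlots = 5;
        Sample _slots[kSlots];
        unsigned _idx = 0;
    };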
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" - -#include <utility> - -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/processinfo.h" - -using namespace mongoutils; - -namespace mongo { - -using std::dec; -using std::endl; -using std::hex; -using std::map; -using std::pair; -using std::string; - -void DurableMappedFile::remapThePrivateView(OperationContext* opCtx) { - verify(storageGlobalParams.dur); - - _willNeedRemap = false; - - // todo 1.9 : it turns out we require that we always remap to the same address. - // so the remove / add isn't necessary and can be removed? - void* old = _view_private; - // privateViews.remove(_view_private); - _view_private = remapPrivateView(opCtx, _view_private); - // privateViews.add(_view_private, this); - fassert(16112, _view_private == old); -} - -/** register view. threadsafe */ -void PointerToDurableMappedFile::add_inlock(void* view, DurableMappedFile* f) { - verify(view); - verify(f); - clearWritableBits_inlock(view, f->length()); - _views.insert(pair<void*, DurableMappedFile*>(view, f)); -} - -/** de-register view. threadsafe */ -void PointerToDurableMappedFile::remove(void* view, size_t len) { - if (view) { - stdx::lock_guard<stdx::mutex> lk(_m); - clearWritableBits_inlock(view, len); - _views.erase(view); - } -} - -#ifdef _WIN32 -void PointerToDurableMappedFile::clearWritableBits(void* privateView, size_t len) { - stdx::lock_guard<stdx::mutex> lk(_m); - clearWritableBits_inlock(privateView, len); -} - -/** notification on unmapping so we can clear writable bits */ -void PointerToDurableMappedFile::clearWritableBits_inlock(void* privateView, size_t len) { - for (unsigned i = reinterpret_cast<size_t>(privateView) / MemoryMappedCOWBitset::ChunkSize; - i <= (reinterpret_cast<size_t>(privateView) + len) / MemoryMappedCOWBitset::ChunkSize; - ++i) { - writable.clear(i); - dassert(!writable.get(i)); - } -} - -extern stdx::mutex mapViewMutex; - -__declspec(noinline) void PointerToDurableMappedFile::makeChunkWritable(size_t chunkno) { - stdx::lock_guard<stdx::mutex> lkPrivateViews(_m); - - if (writable.get(chunkno)) // double check lock - return; - - // remap all maps in this chunk. 
- // common case is a single map, but could have more than one with smallfiles or .ns files - size_t chunkStart = chunkno * MemoryMappedCOWBitset::ChunkSize; - size_t chunkNext = chunkStart + MemoryMappedCOWBitset::ChunkSize; - - stdx::lock_guard<stdx::mutex> lkMapView(mapViewMutex); - - map<void*, DurableMappedFile*>::iterator i = _views.upper_bound((void*)(chunkNext - 1)); - while (1) { - const pair<void*, DurableMappedFile*> x = *(--i); - DurableMappedFile* mmf = x.second; - if (mmf == 0) - break; - - size_t viewStart = reinterpret_cast<size_t>(x.first); - size_t viewEnd = viewStart + mmf->length(); - if (viewEnd <= chunkStart) - break; - - size_t protectStart = std::max(viewStart, chunkStart); - dassert(protectStart < chunkNext); - - size_t protectEnd = std::min(viewEnd, chunkNext); - size_t protectSize = protectEnd - protectStart; - dassert(protectSize > 0 && protectSize <= MemoryMappedCOWBitset::ChunkSize); - - DWORD oldProtection; - bool ok = VirtualProtect( - reinterpret_cast<void*>(protectStart), protectSize, PAGE_WRITECOPY, &oldProtection); - if (!ok) { - DWORD dosError = GetLastError(); - - if (dosError == ERROR_COMMITMENT_LIMIT) { - // System has run out of memory between physical RAM & page file, tell the user - BSONObjBuilder bb; - - ProcessInfo p; - p.getExtraInfo(bb); - - severe() << "MongoDB has exhausted the system memory capacity."; - severe() << "Current Memory Status: " << bb.obj(); - } - - severe() << "VirtualProtect for " << mmf->filename() << " chunk " << chunkno - << " failed with " << errnoWithDescription(dosError) << " (chunk size is " - << protectSize << ", address is " << hex << protectStart << dec << ")" - << " in mongo::makeChunkWritable, terminating" << endl; - - fassertFailed(16362); - } - } - - writable.set(chunkno); -} -#else -void PointerToDurableMappedFile::clearWritableBits(void* privateView, size_t len) {} - -void PointerToDurableMappedFile::clearWritableBits_inlock(void* privateView, size_t len) {} -#endif - -PointerToDurableMappedFile::PointerToDurableMappedFile() { -#if defined(SIZE_MAX) - size_t max = SIZE_MAX; -#else - size_t max = ~((size_t)0); -#endif - verify(max > (size_t) this); // just checking that no one redef'd SIZE_MAX and that it is sane - - // this way we don't need any boundary checking in _find() - _views.insert(pair<void*, DurableMappedFile*>((void*)0, (DurableMappedFile*)0)); - _views.insert(pair<void*, DurableMappedFile*>((void*)max, (DurableMappedFile*)0)); -} - -/** underscore version of find is for when you are already locked - @param ofs out return our offset in the view - @return the DurableMappedFile to which this pointer belongs -*/ -DurableMappedFile* PointerToDurableMappedFile::find_inlock(void* p, /*out*/ size_t& ofs) { - // - // .................memory.......................... - // v1 p v2 - // [--------------------] [-------] - // - // e.g., _find(p) == v1 - // - const pair<void*, DurableMappedFile*> x = *(--_views.upper_bound(p)); - DurableMappedFile* mmf = x.second; - if (mmf) { - size_t o = ((char*)p) - ((char*)x.first); - if (o < mmf->length()) { - ofs = o; - return mmf; - } - } - return 0; -} - -/** find associated MMF object for a given pointer. - threadsafe - @param ofs out returns offset into the view of the pointer, if found. - @return the DurableMappedFile to which this pointer belongs. null if not found. 
-*/ -DurableMappedFile* PointerToDurableMappedFile::find(void* p, /*out*/ size_t& ofs) { - stdx::lock_guard<stdx::mutex> lk(_m); - return find_inlock(p, ofs); -} - -PointerToDurableMappedFile privateViews; - -// here so that it is precomputed... -void DurableMappedFile::setPath(const std::string& f) { - string suffix; - string prefix; - bool ok = str::rSplitOn(f, '.', prefix, suffix); - uassert(13520, - str::stream() << "DurableMappedFile only supports filenames in a certain format " << f, - ok); - if (suffix == "ns") - _fileSuffixNo = dur::JEntry::DotNsSuffix; - else - _fileSuffixNo = (int)str::toUnsigned(suffix); - - _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, prefix); -} - -bool DurableMappedFile::open(OperationContext* opCtx, const std::string& fname) { - LOG(3) << "mmf open " << fname; - invariant(!_view_write); - - setPath(fname); - _view_write = map(opCtx, fname.c_str()); - fassert(16333, _view_write); - return finishOpening(); -} - -bool DurableMappedFile::create(OperationContext* opCtx, - const std::string& fname, - unsigned long long& len) { - LOG(3) << "mmf create " << fname; - invariant(!_view_write); - - setPath(fname); - _view_write = map(opCtx, fname.c_str(), len); - fassert(16332, _view_write); - return finishOpening(); -} - -bool DurableMappedFile::finishOpening() { - LOG(3) << "mmf finishOpening " << (void*)_view_write << ' ' << filename() - << " len:" << length(); - if (_view_write) { - if (storageGlobalParams.dur) { - stdx::lock_guard<stdx::mutex> lk2(privateViews._mutex()); - - _view_private = createPrivateMap(); - if (_view_private == 0) { - severe() << "file " << filename() << " open/create failed in createPrivateMap"; - fassertFailed(13636); - } - // note that testIntent builds use this, even though it points to view_write then... - privateViews.add_inlock(_view_private, this); - } else { - _view_private = _view_write; - } - return true; - } - return false; -} - -void DurableMappedFile::close(OperationContext* opCtx) { - try { - LOG(3) << "mmf close " << filename(); - - // If _view_private was not set, this means file open failed - if (_view_private) { - // Notify the durability system that we are closing a file so it can ensure we - // will not have journaled operations with no corresponding file. - getDur().closingFileNotification(); - } - - privateViews.remove(_view_private, length()); - - MemoryMappedFile::close(opCtx); - } catch (...) { - error() << "exception in DurableMappedFile::close"; - } -} - -DurableMappedFile::DurableMappedFile(OperationContext* opCtx, OptionSet options) - : MemoryMappedFile(opCtx, options), _willNeedRemap(false) { - _view_write = _view_private = 0; -} - -DurableMappedFile::~DurableMappedFile() { - invariant(isClosed()); -} -} diff --git a/src/mongo/db/storage/mmap_v1/durable_mapped_file.h b/src/mongo/db/storage/mmap_v1/durable_mapped_file.h deleted file mode 100644 index 7050156fd25..00000000000 --- a/src/mongo/db/storage/mmap_v1/durable_mapped_file.h +++ /dev/null @@ -1,289 +0,0 @@ -// durable_mapped_file.h - -/* -* -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. 
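The find_inlock() routine above depends on the two null sentinel entries inserted by the PointerToDurableMappedFile constructor: with mappings registered at address 0 and at the maximum address, --upper_bound(p) is always a valid iterator and no boundary checks are needed. A self-contained sketch of that lookup (hypothetical names):

    #include <cstddef>
    #include <cstdint>
    #include <map>

    struct View { std::size_t length; };

    // Null entries at the lowest and highest addresses mean --upper_bound(p)
    // always lands on a valid iterator.
    std::map<void*, View*> views{
        {reinterpret_cast<void*>(std::uintptr_t{0}), nullptr},
        {reinterpret_cast<void*>(UINTPTR_MAX), nullptr},
    };

    View* findView(void* p, std::size_t& ofs) {
        auto it = --views.upper_bound(p);  // last view starting at or before p
        if (View* v = it->second) {
            std::size_t o = static_cast<char*>(p) - static_cast<char*>(it->first);
            if (o < v->length) {
                ofs = o;
                return v;
            }
        }
        return nullptr;  // p points between views or hit a sentinel
    }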
-* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/base/static_assert.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/paths.h" -#include "mongo/stdx/mutex.h" - -namespace mongo { - -/** - * DurableMappedFile adds some layers atop memory mapped files - specifically our handling of - * private views & such. if you don't care about journaling/durability (temp sort files & such) use - * MemoryMappedFile class, not this. - */ -class DurableMappedFile : private MemoryMappedFile { -protected: - virtual void* viewForFlushing() { - return _view_write; - } - -public: - explicit DurableMappedFile(OperationContext* opCtx, OptionSet options = NONE); - virtual ~DurableMappedFile(); - - /** - * Callers must be holding a `LockMongoFilesExclusive`. - */ - virtual void close(OperationContext* opCtx); - - /** @return true if opened ok. */ - bool open(OperationContext* opCtx, const std::string& fname); - - /** @return file length */ - unsigned long long length() const { - return MemoryMappedFile::length(); - } - - std::string filename() const { - return MemoryMappedFile::filename(); - } - - void flush(bool sync) { - MemoryMappedFile::flush(sync); - } - - /* Creates with length if DNE, otherwise uses existing file length, - passed length. - @return true for ok - */ - bool create(OperationContext* opCtx, const std::string& fname, unsigned long long& len); - - /* Get the "standard" view (which is the private one). - @return the private view. - */ - void* getView() const { - return _view_private; - } - - /* Get the "write" view (which is required for writing). - @return the write view. - */ - void* view_write() const { - return _view_write; - } - - /** for a filename a/b/c.3 - filePath() is "a/b/c" - fileSuffixNo() is 3 - if the suffix is "ns", fileSuffixNo -1 - */ - const RelativePath& relativePath() const { - DEV verify(!_p._p.empty()); - return _p; - } - - int fileSuffixNo() const { - return _fileSuffixNo; - } - HANDLE getFd() { - return MemoryMappedFile::getFd(); - } - - /** true if we have written. - set in PREPLOGBUFFER, it is NOT set immediately on write intent declaration. - reset to false in REMAPPRIVATEVIEW - */ - bool willNeedRemap() { - return _willNeedRemap; - } - void setWillNeedRemap() { - _willNeedRemap = true; - } - - void remapThePrivateView(OperationContext* opCtx); - - virtual bool isDurableMappedFile() { - return true; - } - -private: - void* _view_write; - void* _view_private; - bool _willNeedRemap; - RelativePath _p; // e.g. "somepath/dbname" - int _fileSuffixNo; // e.g. 3. 
-1="ns" - - void setPath(const std::string& pathAndFileName); - bool finishOpening(); -}; - - -#ifdef _WIN32 -// Simple array based bitset to track COW chunks in memory mapped files on Windows -// A chunk is a 64MB granular region in virtual memory that we mark as COW everytime we need -// to write to a memory mapped files on Windows -// -class MemoryMappedCOWBitset { - MONGO_DISALLOW_COPYING(MemoryMappedCOWBitset); - -public: - // Size of the chunks we mark Copy-On-Write with VirtualProtect - static const unsigned long long ChunkSize = 64 * 1024 * 1024; - - // Number of chunks we store in our bitset which are really 32-bit ints - static const unsigned long long NChunks = 64 * 1024; - - // Total Virtual Memory space we can cover with the bitset - static const unsigned long long MaxChunkMemory = ChunkSize * NChunks * sizeof(unsigned int) * 8; - - // Size in bytes of the bitset we allocate - static const unsigned long long MaxChunkBytes = NChunks * sizeof(unsigned int); - - // 128 TB Virtual Memory space in Windows 8.1/2012 R2, 8TB before - static const unsigned long long MaxWinMemory = 128ULL * 1024 * 1024 * 1024 * 1024; - - // Make sure that the chunk memory covers the Max Windows user process VM space - MONGO_STATIC_ASSERT_MSG(MaxChunkMemory == MaxWinMemory, - "Need a larger bitset to cover max process VM space"); - -public: - MemoryMappedCOWBitset() { - MONGO_STATIC_ASSERT_MSG(MemoryMappedCOWBitset::MaxChunkBytes == sizeof(bits), - "Validate our predicted bitset size is correct"); - } - - bool get(uintptr_t i) const { - uintptr_t x = i / 32; - verify(x < MemoryMappedCOWBitset::NChunks); - return (bits[x].loadRelaxed() & (1 << (i % 32))) != 0; - } - - // Note: assumes caller holds privateViews.mutex - void set(uintptr_t i) { - uintptr_t x = i / 32; - verify(x < MemoryMappedCOWBitset::NChunks); - bits[x].store(bits[x].loadRelaxed() | (1 << (i % 32))); - } - - // Note: assumes caller holds privateViews.mutex - void clear(uintptr_t i) { - uintptr_t x = i / 32; - verify(x < MemoryMappedCOWBitset::NChunks); - bits[x].store(bits[x].loadRelaxed() & ~(1 << (i % 32))); - } - -private: - // atomic as we are doing double check locking - AtomicUInt32 bits[MemoryMappedCOWBitset::NChunks]; -}; -#endif - -/** for durability support we want to be able to map pointers to specific DurableMappedFile objects. -*/ -class PointerToDurableMappedFile { - MONGO_DISALLOW_COPYING(PointerToDurableMappedFile); - -public: - PointerToDurableMappedFile(); - - /** register view. - not-threadsafe, caller must hold _mutex() - */ - void add_inlock(void* view, DurableMappedFile* f); - - /** de-register view. - threadsafe - */ - void remove(void* view, size_t length); - - /** find associated MMF object for a given pointer. - threadsafe - @param ofs out returns offset into the view of the pointer, if found. - @return the DurableMappedFile to which this pointer belongs. null if not found. 
- */ - DurableMappedFile* find(void* p, /*out*/ size_t& ofs); - - /** for doing many finds in a row with one lock operation */ - stdx::mutex& _mutex() { - return _m; - } - - /** not-threadsafe, caller must hold _mutex() */ - DurableMappedFile* find_inlock(void* p, /*out*/ size_t& ofs); - - /** not-threadsafe, caller must hold _mutex() */ - unsigned numberOfViews_inlock() const { - return _views.size(); - } - - /** make the private map range writable (necessary for our windows implementation) */ - void makeWritable(void*, unsigned len); - - void clearWritableBits(void* privateView, size_t len); - -private: - void clearWritableBits_inlock(void* privateView, size_t len); - -#ifdef _WIN32 - void makeChunkWritable(size_t chunkno); -#endif - -private: - // PointerToDurableMappedFile Mutex - // - // Protects: - // Protects internal consistency of data structure - // Lock Ordering: - // Must be taken before MapViewMutex if both are taken to prevent deadlocks - stdx::mutex _m; - std::map<void*, DurableMappedFile*> _views; - -#ifdef _WIN32 - // Tracks which memory mapped regions are marked as Copy on Write - MemoryMappedCOWBitset writable; -#endif -}; - -#ifdef _WIN32 -inline void PointerToDurableMappedFile::makeWritable(void* privateView, unsigned len) { - size_t p = reinterpret_cast<size_t>(privateView); - unsigned a = p / MemoryMappedCOWBitset::ChunkSize; - unsigned b = (p + len) / MemoryMappedCOWBitset::ChunkSize; - - for (unsigned i = a; i <= b; i++) { - if (!writable.get(i)) { - makeChunkWritable(i); - } - } -} -#else -inline void PointerToDurableMappedFile::makeWritable(void* _p, unsigned len) {} -#endif - -// allows a pointer into any private view of a DurableMappedFile to be resolved to the -// DurableMappedFile object -extern PointerToDurableMappedFile privateViews; -} diff --git a/src/mongo/db/storage/mmap_v1/durop.cpp b/src/mongo/db/storage/mmap_v1/durop.cpp deleted file mode 100644 index 627d53df05d..00000000000 --- a/src/mongo/db/storage/mmap_v1/durop.cpp +++ /dev/null @@ -1,179 +0,0 @@ -// @file durop.cpp - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/durop.h" - -#include <boost/filesystem/operations.hpp> - -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/storage/mmap_v1/aligned_builder.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h" -#include "mongo/util/file.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" - -namespace mongo { - -using std::unique_ptr; -using std::shared_ptr; -using std::endl; -using std::string; - -namespace dur { - -/** read a durop from journal file referenced by br. - @param opcode the opcode which has already been written from the bufreader -*/ -shared_ptr<DurOp> DurOp::read(unsigned opcode, BufReader& br) { - shared_ptr<DurOp> op; - switch (opcode) { - case JEntry::OpCode_FileCreated: - op = shared_ptr<DurOp>(new FileCreatedOp(br)); - break; - case JEntry::OpCode_DropDb: - op = shared_ptr<DurOp>(new DropDbOp(br)); - break; - default: - massert(13546, - (str::stream() << "journal recover: unrecognized opcode in journal " << opcode), - false); - } - return op; -} - -void DurOp::serialize(AlignedBuilder& ab) { - ab.appendNum(_opcode); - _serialize(ab); -} - -DropDbOp::DropDbOp(BufReader& log) : DurOp(JEntry::OpCode_DropDb) { - unsigned long long reserved; - log.read(reserved); - log.read(reserved); - log.readStr(_db); - string reservedStr; - log.readStr(reservedStr); -} - -void DropDbOp::_serialize(AlignedBuilder& ab) { - ab.appendNum((unsigned long long)0); // reserved for future use - ab.appendNum((unsigned long long)0); // reserved for future use - ab.appendStr(_db); - ab.appendStr(""); // reserved -} - -/** throws */ -void DropDbOp::replay() { - log() << "recover replay drop db " << _db << endl; - _deleteDataFiles(_db); -} - -FileCreatedOp::FileCreatedOp(const std::string& f, unsigned long long l) - : DurOp(JEntry::OpCode_FileCreated) { - _p = RelativePath::fromFullPath(storageGlobalParams.dbpath, f); - _len = l; -} - -FileCreatedOp::FileCreatedOp(BufReader& log) : DurOp(JEntry::OpCode_FileCreated) { - unsigned long long reserved; - log.read(reserved); - log.read(reserved); - log.read(_len); // size of file, not length of name - string s; - log.readStr(s); - _p._p = s; -} - -void FileCreatedOp::_serialize(AlignedBuilder& ab) { - ab.appendNum((unsigned long long)0); // reserved for future use - ab.appendNum((unsigned long long)0); // reserved for future use - ab.appendNum(_len); - ab.appendStr(_p.toString()); -} - -string FileCreatedOp::toString() { - return str::stream() << "FileCreatedOp " << _p.toString() << ' ' << _len / 1024.0 / 1024.0 - << "MB"; -} - -// if an operation deletes or creates a file (or moves etc.), it may need files closed. -bool FileCreatedOp::needFilesClosed() { - return boost::filesystem::exists(_p.asFullPath()); -} - -void FileCreatedOp::replay() { - // i believe the code assumes new files are filled with zeros. thus we have to recreate the - // file, or rewrite at least, even if it were the right length. perhaps one day we should - // change that although easier to avoid defects if we assume it is zeros perhaps. 
- string full = _p.asFullPath(); - if (boost::filesystem::exists(full)) { - try { - boost::filesystem::remove(full); - } catch (std::exception& e) { - LOG(1) << "recover info FileCreateOp::replay unlink " << e.what() << endl; - } - } - - log() << "recover create file " << full << ' ' << _len / 1024.0 / 1024.0 << "MB" << endl; - if (boost::filesystem::exists(full)) { - // first delete if exists. - try { - boost::filesystem::remove(full); - } catch (...) { - log() << "warning could not delete file " << full << endl; - } - } - ensureParentDirCreated(full); - File f; - f.open(full.c_str()); - massert(13547, str::stream() << "recover couldn't create file " << full, f.is_open()); - unsigned long long left = _len; - const unsigned blksz = 64 * 1024; - unique_ptr<char[]> v(new char[blksz]); - memset(v.get(), 0, blksz); - fileofs ofs = 0; - while (left) { - unsigned long long w = left < blksz ? left : blksz; - f.write(ofs, v.get(), (unsigned)w); - left -= w; - ofs += w; - } - f.fsync(); - flushMyDirectory(full); - massert(13628, str::stream() << "recover failure writing file " << full, !f.bad()); -} -} -} diff --git a/src/mongo/db/storage/mmap_v1/durop.h b/src/mongo/db/storage/mmap_v1/durop.h deleted file mode 100644 index 17a78ff220d..00000000000 --- a/src/mongo/db/storage/mmap_v1/durop.h +++ /dev/null @@ -1,129 +0,0 @@ -// @file durop.h class DurOp and descendants - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - - -#include "mongo/db/storage/mmap_v1/dur_journalformat.h" -#include "mongo/db/storage/mmap_v1/paths.h" -#include "mongo/util/bufreader.h" - -namespace mongo { - -class AlignedBuilder; - -namespace dur { - -/** DurOp - Operations we journal that aren't just basic writes. - * - * Basic writes are logged as JEntry's, and indicated in ram temporarily as struct - * dur::WriteIntent. We don't make WriteIntent inherit from DurOp to keep it as lean as possible as - * there will be millions of them (we don't want a vtable for example there). - * - * For each op we want to journal, we define a subclass. 
- */ -class DurOp { /* copyable */ -public: - // @param opcode a sentinel value near max unsigned which uniquely identifies the operation. - // @see dur::JEntry - DurOp(unsigned opcode) : _opcode(opcode) {} - - virtual ~DurOp() {} - - /** serialize the op out to a builder which will then be written (presumably) to the journal */ - void serialize(AlignedBuilder& ab); - - /** read a durop from journal file referenced by br. - @param opcode the opcode which has already been written from the bufreader - */ - static std::shared_ptr<DurOp> read(unsigned opcode, BufReader& br); - - /** replay the operation (during recovery) - throws - - For now, these are not replayed during the normal WRITETODATAFILES phase, since these - operations are handled in other parts of the code. At some point this may change. - */ - virtual void replay() = 0; - - virtual std::string toString() = 0; - - /** if the op requires all file to be closed before doing its work, returns true. */ - virtual bool needFilesClosed() { - return false; - } - -protected: - /** DurOp will have already written the opcode for you */ - virtual void _serialize(AlignedBuilder& ab) = 0; - -private: - const unsigned _opcode; -}; - -/** indicates creation of a new file */ -class FileCreatedOp : public DurOp { -public: - FileCreatedOp(BufReader& log); - /** param f filename to create with path */ - FileCreatedOp(const std::string& f, unsigned long long l); - virtual void replay(); - virtual std::string toString(); - virtual bool needFilesClosed(); - -protected: - virtual void _serialize(AlignedBuilder& ab); - -private: - RelativePath _p; - unsigned long long _len; // size of file, not length of name -}; - -/** record drop of a database */ -class DropDbOp : public DurOp { -public: - DropDbOp(BufReader& log); - DropDbOp(const std::string& db) : DurOp(JEntry::OpCode_DropDb), _db(db) {} - virtual void replay(); - virtual std::string toString() { - return std::string("DropDbOp ") + _db; - } - virtual bool needFilesClosed() { - return true; - } - -protected: - virtual void _serialize(AlignedBuilder& ab); - -private: - std::string _db; -}; -} -} diff --git a/src/mongo/db/storage/mmap_v1/extent.cpp b/src/mongo/db/storage/mmap_v1/extent.cpp deleted file mode 100644 index 92dd07933b6..00000000000 --- a/src/mongo/db/storage/mmap_v1/extent.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// extent.cpp - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. 
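The DurOp hierarchy above makes adding a journaled operation mechanical: pick an opcode near max unsigned, serialize a payload after the base class writes the opcode, and implement replay(). A hedged illustration of the shape such a subclass took, building on the declarations above; the operation and opcode here are invented and were never part of the journal format:

    class TouchFileOp : public DurOp {  // invented op, illustrative only
    public:
        explicit TouchFileOp(const std::string& f)
            : DurOp(OpCode_Hypothetical), _f(f) {}

        virtual void replay() { /* redo the effect during journal recovery */ }
        virtual std::string toString() {
            return "TouchFileOp " + _f;
        }
        virtual bool needFilesClosed() {
            return true;  // ops touching files on disk need them closed first
        }

    protected:
        virtual void _serialize(AlignedBuilder& ab) {
            ab.appendStr(_f);  // DurOp::serialize() has already written the opcode
        }

    private:
        static const unsigned OpCode_Hypothetical = 0xfffffff0;  // not a real JEntry opcode
        std::string _f;
    };

A real subclass also needed a BufReader constructor and a matching case in DurOp::read(), so serialization and recovery stayed symmetric.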
If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/db/storage/mmap_v1/extent.h" - -#include "mongo/base/static_assert.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/util/hex.h" -#include "mongo/util/mongoutils/str.h" - -namespace mongo { - -using std::iostream; -using std::string; -using std::vector; - -MONGO_STATIC_ASSERT(sizeof(Extent) - 4 == 48 + 128); - -BSONObj Extent::dump() const { - return BSON("loc" << myLoc.toString() << "xnext" << xnext.toString() << "xprev" - << xprev.toString() - << "nsdiag" - << nsDiagnostic.toString() - << "size" - << length - << "firstRecord" - << firstRecord.toString() - << "lastRecord" - << lastRecord.toString()); -} - -void Extent::dump(iostream& s) const { - s << " loc:" << myLoc.toString() << " xnext:" << xnext.toString() - << " xprev:" << xprev.toString() << '\n'; - s << " nsdiag:" << nsDiagnostic.toString() << '\n'; - s << " size:" << length << " firstRecord:" << firstRecord.toString() - << " lastRecord:" << lastRecord.toString() << '\n'; -} - -bool Extent::validates(const DiskLoc diskLoc, vector<string>* errors) const { - bool extentOk = true; - if (magic != extentSignature) { - if (errors) { - StringBuilder sb; - sb << "bad extent signature " << integerToHex(magic) << " in extent " - << diskLoc.toString(); - errors->push_back(sb.str()); - } - extentOk = false; - } - if (myLoc != diskLoc) { - if (errors) { - StringBuilder sb; - sb << "extent " << diskLoc.toString() << " self-pointer is " << myLoc.toString(); - errors->push_back(sb.str()); - } - extentOk = false; - } - if (firstRecord.isNull() != lastRecord.isNull()) { - if (errors) { - StringBuilder sb; - if (firstRecord.isNull()) { - sb << "in extent " << diskLoc.toString() - << ", firstRecord is null but lastRecord is " << lastRecord.toString(); - } else { - sb << "in extent " << diskLoc.toString() << ", firstRecord is " - << firstRecord.toString() << " but lastRecord is null"; - } - errors->push_back(sb.str()); - } - extentOk = false; - } - static const int minSize = 0x1000; - if (length < minSize) { - if (errors) { - StringBuilder sb; - sb << "length of extent " << diskLoc.toString() << " is " << length - << ", which is less than minimum length of " << minSize; - errors->push_back(sb.str()); - } - extentOk = false; - } - return extentOk; -} -} diff --git a/src/mongo/db/storage/mmap_v1/extent.h b/src/mongo/db/storage/mmap_v1/extent.h deleted file mode 100644 index 16af89fb42b..00000000000 --- a/src/mongo/db/storage/mmap_v1/extent.h +++ /dev/null @@ -1,89 +0,0 @@ -// extent.h - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. 
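Extent::validates() above uses an error-accumulating validation style: it keeps checking after the first failure so one pass reports every problem, and it only formats messages when the caller supplied an error vector. The same pattern in miniature:

    #include <string>
    #include <vector>

    bool validateHeader(unsigned magic, int length, std::vector<std::string>* errors) {
        bool ok = true;
        if (magic != 0x41424344) {  // extentSignature
            if (errors)
                errors->push_back("bad extent signature");
            ok = false;  // note: keep checking rather than returning early
        }
        if (length < 0x1000) {  // 4KB minimum, as in the original
            if (errors)
                errors->push_back("extent smaller than minimum length");
            ok = false;
        }
        return ok;
    }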
-* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <iosfwd> -#include <string> -#include <vector> - -#include "mongo/db/storage/mmap_v1/catalog/namespace.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -/* extents are datafile regions where all the records within the region - belong to the same namespace. - -(11:12:35 AM) dm10gen: when the extent is allocated, all its empty space is stuck into one big - DeletedRecord -(11:12:55 AM) dm10gen: and that is placed on the free list -*/ -#pragma pack(1) -struct Extent { - enum { extentSignature = 0x41424344 }; - unsigned magic; - DiskLoc myLoc; - - /* next/prev extent for this namespace */ - DiskLoc xnext; - DiskLoc xprev; - - /* which namespace this extent is for. this is just for troubleshooting really - and won't even be correct if the collection were renamed! - */ - Namespace nsDiagnostic; - - int length; /* size of the extent, including these fields */ - DiskLoc firstRecord; - DiskLoc lastRecord; - char _extentData[4]; - - // ----- - - bool validates(const DiskLoc diskLoc, std::vector<std::string>* errors = NULL) const; - - BSONObj dump() const; - - void dump(std::iostream& s) const; - - bool isOk() const { - return magic == extentSignature; - } - void assertOk() const { - verify(isOk()); - } - - static int HeaderSize() { - return sizeof(Extent) - 4; - } -}; -#pragma pack() -} diff --git a/src/mongo/db/storage/mmap_v1/extent_manager.cpp b/src/mongo/db/storage/mmap_v1/extent_manager.cpp deleted file mode 100644 index 15222fac01a..00000000000 --- a/src/mongo/db/storage/mmap_v1/extent_manager.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// extent_manager.cpp - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. 
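The #pragma pack(1) block above is the classic on-disk layout pattern: padding is disabled so the struct matches the file bytes exactly, and the magic field is checked before any other field is trusted. A self-contained sketch of the same pattern with a hypothetical header:

    #include <cstdint>

    #pragma pack(1)
    struct DiskHeader {  // hypothetical on-disk header
        static const std::uint32_t kSignature = 0x41424344;  // "ABCD"
        std::uint32_t magic;  // validated before anything else is read
        std::int32_t length;

        bool isOk() const {
            return magic == kSignature;
        }
    };
    #pragma pack()

    static_assert(sizeof(DiskHeader) == 8, "on-disk structs must not gain padding");

The static_assert mirrors the MONGO_STATIC_ASSERT on sizeof(Extent) above: any compiler-introduced padding would silently corrupt the file format, so the size is pinned at compile time.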
If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#include "mongo/db/storage/mmap_v1/extent_manager.h" - -#include "mongo/db/storage/mmap_v1/extent.h" - -namespace mongo { - -int ExtentManager::quantizeExtentSize(int size) const { - if (size == maxSize()) { - // no point doing quantizing for the entire file - return size; - } - - invariant(size <= maxSize()); - - // make sizes align with VM page size - int newSize = (size + 0xfff) & 0xfffff000; - - if (newSize > maxSize()) { - return maxSize(); - } - - if (newSize < minSize()) { - return minSize(); - } - - return newSize; -} - -int ExtentManager::followupSize(int len, int lastExtentLen) const { - invariant(len < maxSize()); - int x = initialSize(len); - // changed from 1.20 to 1.35 in v2.1.x to get to larger extent size faster - int y = (int)(lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.35); - int sz = y > x ? y : x; - - if (sz < lastExtentLen) { - // this means there was an int overflow - // so we should turn it into maxSize - return maxSize(); - } else if (sz > maxSize()) { - return maxSize(); - } - - sz = quantizeExtentSize(sz); - verify(sz >= len); - - return sz; -} - -int ExtentManager::initialSize(int len) const { - invariant(len <= maxSize()); - - long long sz = len * 16; - if (len < 1000) - sz = len * 64; - - if (sz >= maxSize()) - return maxSize(); - - if (sz <= minSize()) - return minSize(); - - int z = ExtentManager::quantizeExtentSize(sz); - verify(z >= len); - return z; -} -} diff --git a/src/mongo/db/storage/mmap_v1/extent_manager.h b/src/mongo/db/storage/mmap_v1/extent_manager.h deleted file mode 100644 index 6b0e18c44f3..00000000000 --- a/src/mongo/db/storage/mmap_v1/extent_manager.h +++ /dev/null @@ -1,197 +0,0 @@ -// extent_manager.h - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
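quantizeExtentSize() above rounds a request up to the next 4KB boundary with (size + 0xfff) & 0xfffff000 and then clamps to the legal range. Worked through: a 5,000-byte request becomes 9,095 after adding 0xfff (4,095), and masking off the low 12 bits yields 8,192, i.e. two pages. A sketch with the same arithmetic:

    #include <algorithm>
    #include <cassert>

    int quantize(int size, int minSize, int maxSize) {
        if (size >= maxSize)
            return maxSize;  // no point quantizing a whole-file request
        int aligned = (size + 0xfff) & 0xfffff000;  // round up to a 4KB page
        return std::min(std::max(aligned, minSize), maxSize);
    }

    int main() {
        assert(quantize(4096, 0x1000, 16 * 1024 * 1024) == 4096);  // already aligned
        assert(quantize(5000, 0x1000, 16 * 1024 * 1024) == 8192);  // up one page
        assert(quantize(0, 0x1000, 16 * 1024 * 1024) == 4096);     // clamped to min
    }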
-*/ - -#pragma once - -#include <memory> -#include <string> -#include <vector> - -#include "mongo/base/status.h" -#include "mongo/base/string_data.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" - -namespace mongo { - -class DataFile; -class DataFileVersion; -class MmapV1RecordHeader; -class RecordFetcher; -class OperationContext; - -struct Extent; - -/** - * ExtentManager basics - * - one per database - * - responsible for managing <db>.# files - * - NOT responsible for .ns file - * - gives out extents - * - responsible for figuring out how to get a new extent - * - can use any method it wants to do so - * - this structure is NOT stored on disk - * - files will not be removed from the EM - * - extent size and loc are immutable - * - this class is thread safe, once constructed and init()-ialized - */ -class ExtentManager { - MONGO_DISALLOW_COPYING(ExtentManager); - -public: - ExtentManager() {} - - class Factory { - public: - virtual ~Factory() = default; - virtual std::unique_ptr<ExtentManager> create(StringData dbname, - StringData path, - bool directoryPerDB) = 0; - }; - - virtual ~ExtentManager() {} - - virtual void close(OperationContext* opCtx) = 0; - - /** - * opens all current files - */ - virtual Status init(OperationContext* opCtx) = 0; - - virtual int numFiles() const = 0; - virtual long long fileSize() const = 0; - - // must call Extent::reuse on the returned extent - virtual DiskLoc allocateExtent(OperationContext* opCtx, - bool capped, - int size, - bool enforceQuota) = 0; - - /** - * firstExt has to be == lastExt or a chain - */ - virtual void freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt) = 0; - - /** - * frees a single extent - * ignores all fields in the Extent except: magic, myLoc, length - */ - virtual void freeExtent(OperationContext* opCtx, DiskLoc extent) = 0; - - /** - * Retrieve statistics on the the free list managed by this ExtentManger. - * @param numExtents - non-null pointer to an int that will receive the number of extents - * @param totalFreeSizeBytes - non-null pointer to an int64_t receiving the total free - * space in the free list. - */ - virtual void freeListStats(OperationContext* opCtx, - int* numExtents, - int64_t* totalFreeSizeBytes) const = 0; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader - * Note(erh): this sadly cannot be removed. - * A MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really wants an - * offset from an extent. This intrinsically links an original record store to the original - * extent manager. - */ - virtual MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const = 0; - - /** - * The extent manager tracks accesses to DiskLocs. This returns non-NULL if the DiskLoc has - * been recently accessed, and therefore has likely been paged into physical memory. - * Returns nullptr if the DiskLoc is Null. 
- * - */ - virtual std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const = 0; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent) - * Note(erh) see comment on recordFor - */ - virtual Extent* extentForV1(const DiskLoc& loc) const = 0; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent) - * Note(erh) see comment on recordFor - */ - virtual DiskLoc extentLocForV1(const DiskLoc& loc) const = 0; - - /** - * @param loc - has to be for a specific Extent - */ - virtual Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const = 0; - - /** - * @return maximum size of an Extent - */ - virtual int maxSize() const = 0; - - /** - * @return minimum size of an Extent - */ - virtual int minSize() const { - return 0x1000; - } - - /** - * @param recordLen length of record we need - * @param lastExt size of last extent which is a factor in next extent size - */ - virtual int followupSize(int recordLen, int lastExtentLen) const; - - /** get a suggested size for the first extent in a namespace - * @param recordLen length of record we need to insert - */ - virtual int initialSize(int recordLen) const; - - /** - * quantizes extent size to >= min + page boundary - */ - virtual int quantizeExtentSize(int size) const; - - // see cacheHint methods - enum HintType { Sequential, Random }; - class CacheHint { - public: - virtual ~CacheHint() {} - }; - /** - * Tell the system that for this extent, it will have this kind of disk access. - * Caller takes owernship of CacheHint - */ - virtual CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint) = 0; - - virtual DataFileVersion getFileFormat(OperationContext* opCtx) const = 0; - virtual void setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) = 0; - - virtual const DataFile* getOpenFile(int n) const = 0; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/file_allocator.cpp b/src/mongo/db/storage/mmap_v1/file_allocator.cpp deleted file mode 100644 index daf9a13c659..00000000000 --- a/src/mongo/db/storage/mmap_v1/file_allocator.cpp +++ /dev/null @@ -1,492 +0,0 @@ -// @file file_allocator.cpp - -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. 
If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/file_allocator.h" - -#include <boost/filesystem/operations.hpp> -#include <errno.h> -#include <fcntl.h> - -#if defined(__FreeBSD__) -#include <sys/mount.h> -#include <sys/param.h> -#endif - -#if defined(__linux__) -#include <sys/vfs.h> -#endif - -#if defined(_WIN32) -#include <io.h> -#endif - -#include "mongo/db/storage/mmap_v1/paths.h" -#include "mongo/platform/posix_fadvise.h" -#include "mongo/stdx/functional.h" -#include "mongo/stdx/thread.h" -#include "mongo/util/concurrency/idle_thread_block.h" -#include "mongo/util/concurrency/thread_name.h" -#include "mongo/util/fail_point.h" -#include "mongo/util/fail_point_service.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/processinfo.h" -#include "mongo/util/text.h" -#include "mongo/util/time_support.h" -#include "mongo/util/timer.h" - -using namespace mongoutils; - -#ifndef O_NOATIME -#define O_NOATIME (0) -#endif - -namespace mongo { - -using std::endl; -using std::list; -using std::string; -using std::stringstream; - -// unique number for temporary file names -unsigned long long FileAllocator::_uniqueNumber = 0; -static SimpleMutex _uniqueNumberMutex; - -MONGO_FAIL_POINT_DEFINE(allocateDiskFull); - -/** - * Aliases for Win32 CRT functions - */ -#if defined(_WIN32) -static inline long lseek(int fd, long offset, int origin) { - return _lseek(fd, offset, origin); -} -static inline int write(int fd, const void* data, int count) { - return _write(fd, data, count); -} -static inline int close(int fd) { - return _close(fd); -} - -typedef BOOL(CALLBACK* GetVolumeInformationByHandleWPtr)( - HANDLE, LPWSTR, DWORD, LPDWORD, LPDWORD, LPDWORD, LPWSTR, DWORD); -GetVolumeInformationByHandleWPtr GetVolumeInformationByHandleWFunc; - -MONGO_INITIALIZER(InitGetVolumeInformationByHandleW)(InitializerContext* context) { - HMODULE kernelLib = LoadLibraryA("kernel32.dll"); - if (kernelLib) { - GetVolumeInformationByHandleWFunc = reinterpret_cast<GetVolumeInformationByHandleWPtr>( - GetProcAddress(kernelLib, "GetVolumeInformationByHandleW")); - } - return Status::OK(); -} -#endif - -boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p) { - const boost::filesystem::path parent = p.branch_path(); - - if (!boost::filesystem::exists(parent)) { - ensureParentDirCreated(parent); - log() << "creating directory " << parent.string() << endl; - boost::filesystem::create_directory(parent); - flushMyDirectory(parent); // flushes grandparent to ensure parent exists after crash - } - - verify(boost::filesystem::is_directory(parent)); - return parent; -} - -FileAllocator::FileAllocator() : _failed() {} - - -void FileAllocator::start() { - stdx::thread t([this] { run(this); }); - t.detach(); -} - -void FileAllocator::requestAllocation(const string& name, long& size) { - stdx::lock_guard<stdx::mutex> lk(_pendingMutex); - if (_failed) - return; - long oldSize = prevSize(name); - if (oldSize != -1) { - size = oldSize; - return; - } - _pending.push_back(name); - _pendingSize[name] = size; - _pendingUpdated.notify_all(); -} - -void FileAllocator::allocateAsap(const string& name, unsigned long long& size) { - stdx::unique_lock<stdx::mutex> lk(_pendingMutex); - - // In case the allocator is in failed state, check once before starting 
so that subsequent - // requests for the same database would fail fast after the first one has failed. - checkFailure(); - - long oldSize = prevSize(name); - if (oldSize != -1) { - size = oldSize; - if (!inProgress(name)) - return; - } - checkFailure(); - _pendingSize[name] = size; - if (_pending.size() == 0) - _pending.push_back(name); - else if (_pending.front() != name) { - _pending.remove(name); - list<string>::iterator i = _pending.begin(); - ++i; - _pending.insert(i, name); - } - _pendingUpdated.notify_all(); - while (inProgress(name)) { - checkFailure(); - _pendingUpdated.wait(lk); - } -} - -void FileAllocator::waitUntilFinished() const { - if (_failed) - return; - stdx::unique_lock<stdx::mutex> lk(_pendingMutex); - while (_pending.size() != 0) - _pendingUpdated.wait(lk); -} - -// TODO: pull this out to per-OS files once they exist -static bool useSparseFiles(int fd) { -#if defined(__linux__) || defined(__FreeBSD__) - struct statfs fs_stats; - int ret = fstatfs(fd, &fs_stats); - uassert(16062, "fstatfs failed: " + errnoWithDescription(), ret == 0); -#endif - -#if defined(__linux__) -// these are from <linux/magic.h> but that isn't available on all systems -#define NFS_SUPER_MAGIC 0x6969 -#define TMPFS_MAGIC 0x01021994 -#define ZFS_SUPER_MAGIC 0x2fc12fc1 - return (fs_stats.f_type == NFS_SUPER_MAGIC) || (fs_stats.f_type == TMPFS_MAGIC) || - (fs_stats.f_type == ZFS_SUPER_MAGIC); - -#elif defined(__FreeBSD__) - - return (str::equals(fs_stats.f_fstypename, "zfs") || - str::equals(fs_stats.f_fstypename, "nfs") || - str::equals(fs_stats.f_fstypename, "oldnfs")); - -#elif defined(__sun) - // assume using ZFS which is copy-on-write so no benefit to zero-filling - // TODO: check which fs we are using like we do elsewhere - return true; -#else - return false; -#endif -} - -#if defined(_WIN32) -static bool isFileOnNTFSVolume(int fd) { - if (!GetVolumeInformationByHandleWFunc) { - warning() << "Could not retrieve pointer to GetVolumeInformationByHandleW function"; - return false; - } - - HANDLE fileHandle = (HANDLE)_get_osfhandle(fd); - if (fileHandle == INVALID_HANDLE_VALUE) { - warning() << "_get_osfhandle() failed with " << _strerror(NULL); - return false; - } - - WCHAR fileSystemName[MAX_PATH + 1]; - if (!GetVolumeInformationByHandleWFunc( - fileHandle, NULL, 0, NULL, 0, NULL, fileSystemName, sizeof(fileSystemName))) { - DWORD gle = GetLastError(); - warning() << "GetVolumeInformationByHandleW failed with " << errnoWithDescription(gle); - return false; - } - - return lstrcmpW(fileSystemName, L"NTFS") == 0; -} -#endif - -void FileAllocator::ensureLength(int fd, long size) { - // Test running out of disk scenarios - if (MONGO_FAIL_POINT(allocateDiskFull)) { - uasserted(10444, "File allocation failed due to failpoint."); - } - -#if !defined(_WIN32) - if (useSparseFiles(fd)) { - LOG(1) << "using ftruncate to create a sparse file" << endl; - int ret = ftruncate(fd, size); - uassert(16063, "ftruncate failed: " + errnoWithDescription(), ret == 0); - return; - } -#endif - -#if defined(__linux__) - int ret = posix_fallocate(fd, 0, size); - if (ret == 0) { - LOG(1) << "used fallocate to create empty file"; - return; - } - - log() << "FileAllocator: posix_fallocate failed: " << errnoWithDescription(ret) - << " falling back" << endl; -#endif - - off_t filelen = lseek(fd, 0, SEEK_END); - if (filelen < size) { - if (filelen != 0) { - stringstream ss; - ss << "failure creating new datafile; lseek failed for fd " << fd - << " with errno: " << errnoWithDescription(); - uassert(10440, ss.str(), filelen == 0); 
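-            // filelen != 0 on this path, so the uassert above always throws;
-            // execution never falls through to the end of this block.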
- } - // Check for end of disk. - - uassert(10441, - str::stream() << "Unable to allocate new file of size " << size << ' ' - << errnoWithDescription(), - size - 1 == lseek(fd, size - 1, SEEK_SET)); - uassert(10442, - str::stream() << "Unable to allocate new file of size " << size << ' ' - << errnoWithDescription(), - 1 == write(fd, "", 1)); - - // File expansion is completed here. Do not do the zeroing out on OS-es where there - // is no risk of triggering allocation-related bugs such as - // http://support.microsoft.com/kb/2731284. - // - if (!ProcessInfo::isDataFileZeroingNeeded()) { - return; - } - -#if defined(_WIN32) - if (!isFileOnNTFSVolume(fd)) { - log() << "No need to zero out datafile on non-NTFS volume" << endl; - return; - } -#endif - - lseek(fd, 0, SEEK_SET); - - log() << "filling with zeroes..."; - const long z = 256 * 1024; - const std::unique_ptr<char[]> buf_holder(new char[z]); - char* buf = buf_holder.get(); - memset(buf, 0, z); - long left = size; - while (left > 0) { - long towrite = left; - if (towrite > z) - towrite = z; - - int written = write(fd, buf, towrite); - uassert(10443, errnoWithPrefix("FileAllocator: file write failed"), written > 0); - left -= written; - } - } -} - -void FileAllocator::checkFailure() { - if (_failed) { - // we want to log the problem (diskfull.js expects it) but we do not want to dump a stack - // trace - msgasserted(12520, "new file allocation failure"); - } -} - -long FileAllocator::prevSize(const string& name) const { - if (_pendingSize.count(name) > 0) - return _pendingSize[name]; - if (boost::filesystem::exists(name)) - return boost::filesystem::file_size(name); - return -1; -} - -// caller must hold _pendingMutex lock. -bool FileAllocator::inProgress(const string& name) const { - for (list<string>::const_iterator i = _pending.begin(); i != _pending.end(); ++i) - if (*i == name) - return true; - return false; -} - -string FileAllocator::makeTempFileName(boost::filesystem::path root) { - while (1) { - boost::filesystem::path p = root / "_tmp"; - stringstream ss; - unsigned long long thisUniqueNumber; - { - // increment temporary file name counter - // TODO: SERVER-6055 -- Unify temporary file name selection - stdx::lock_guard<SimpleMutex> lk(_uniqueNumberMutex); - thisUniqueNumber = _uniqueNumber; - ++_uniqueNumber; - } - ss << thisUniqueNumber; - p /= ss.str(); - string fn = p.string(); - if (!boost::filesystem::exists(p)) - return fn; - } - return ""; -} - -void FileAllocator::run(FileAllocator* fa) { - setThreadName("FileAllocator"); - { - // initialize unique temporary file name counter - // TODO: SERVER-6055 -- Unify temporary file name selection - stdx::lock_guard<SimpleMutex> lk(_uniqueNumberMutex); - _uniqueNumber = curTimeMicros64(); - } - while (1) { - { - stdx::unique_lock<stdx::mutex> lk(fa->_pendingMutex); - if (fa->_pending.size() == 0) { - MONGO_IDLE_THREAD_BLOCK; - fa->_pendingUpdated.wait(lk); - } - } - while (1) { - string name; - long size = 0; - { - stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex); - if (fa->_pending.size() == 0) - break; - name = fa->_pending.front(); - size = fa->_pendingSize[name]; - } - - string tmp; - long fd = 0; - try { - log() << "allocating new datafile " << name; - - boost::filesystem::path parent = ensureParentDirCreated(name); - tmp = fa->makeTempFileName(parent); - ensureParentDirCreated(tmp); - -#if defined(_WIN32) - fd = _wopen(toNativeString(tmp.c_str()).c_str(), - _O_RDWR | _O_CREAT | O_NOATIME, - _S_IREAD | _S_IWRITE); -#else - fd = open(tmp.c_str(), O_CREAT | O_RDWR | 
O_NOATIME, S_IRUSR | S_IWUSR); -#endif - if (fd < 0) { - log() << "FileAllocator: couldn't create " << name << " (" << tmp << ") " - << errnoWithDescription() << endl; - uasserted(10439, ""); - } - -#if defined(POSIX_FADV_DONTNEED) - if (posix_fadvise(fd, 0, size, POSIX_FADV_DONTNEED)) { - log() << "warning: posix_fadvise fails " << name << " (" << tmp << ") " - << errnoWithDescription() << endl; - } -#endif - - Timer t; - - /* make sure the file is the full desired length */ - ensureLength(fd, size); - - close(fd); - fd = 0; - - boost::system::error_code ec; - boost::filesystem::rename(tmp.c_str(), name.c_str(), ec); - if (ec) { - const string& errMessage = str::stream() << "error: couldn't rename " << tmp - << " to " << name << ' ' - << ec.message(); - msgasserted(13653, errMessage); - } - - flushMyDirectory(name); - - log() << "done allocating datafile " << name << ", " - << "size: " << size / 1024 / 1024 << "MB, " - << " took " << ((double)t.millis()) / 1000.0 << " secs" << endl; - - // no longer in a failed state. allow new writers. - fa->_failed = false; - } catch (const std::exception& e) { - log() << "error: failed to allocate new file: " << name << " size: " << size << ' ' - << e.what() << ". will try again in 10 seconds" << endl; - if (fd > 0) - close(fd); - try { - if (!tmp.empty()) - boost::filesystem::remove(tmp); - boost::filesystem::remove(name); - } catch (const std::exception& e) { - log() << "error removing files: " << e.what() << endl; - } - - { - stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex); - fa->_failed = true; - - // TODO: Should we remove the file from pending? - fa->_pendingUpdated.notify_all(); - } - - - sleepsecs(10); - continue; - } - - { - stdx::lock_guard<stdx::mutex> lk(fa->_pendingMutex); - fa->_pendingSize.erase(name); - fa->_pending.pop_front(); - fa->_pendingUpdated.notify_all(); - } - } - } -} - -FileAllocator* FileAllocator::get() { - static FileAllocator instance; - return &instance; -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/file_allocator.h b/src/mongo/db/storage/mmap_v1/file_allocator.h deleted file mode 100644 index 589cf908dc0..00000000000 --- a/src/mongo/db/storage/mmap_v1/file_allocator.h +++ /dev/null @@ -1,105 +0,0 @@ -// @file file_allocator.h - -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. 
If you do not
- * wish to do so, delete this exception statement from your version. If you
- * delete this exception statement from all source files in the program,
- * then also delete it in the license file.
- */
-
-#include <boost/filesystem/path.hpp>
-#include <list>
-#include <map>
-
-#include "mongo/stdx/condition_variable.h"
-#include "mongo/stdx/mutex.h"
-#include "mongo/util/concurrency/mutex.h"
-
-namespace mongo {
-
-/*
- * Handles allocation of contiguous files on disk. Allocation may be
- * requested asynchronously or synchronously. Singleton.
- */
-class FileAllocator {
-    MONGO_DISALLOW_COPYING(FileAllocator);
-    /*
-     * The public functions may not be called concurrently. The allocation
-     * functions may be called multiple times per file, but only the first
-     * size specified per file will be used.
-     */
-public:
-    void start();
-
-    /**
-     * May be called if the file exists. If the file exists, or its allocation
-     * has been requested, size is updated to match the existing file size.
-     */
-    void requestAllocation(const std::string& name, long& size);
-
-    /**
-     * Returns when the file has been allocated. If the file exists, size is
-     * updated to match the existing file size.
-     */
-    void allocateAsap(const std::string& name, unsigned long long& size);
-
-    void waitUntilFinished() const;
-
-    static void ensureLength(int fd, long size);
-
-    /** @return the singleton */
-    static FileAllocator* get();
-
-private:
-    FileAllocator();
-
-    void checkFailure();
-
-    // Caller must hold the _pendingMutex lock. Returns size if allocated or
-    // allocation requested, -1 otherwise.
-    long prevSize(const std::string& name) const;
-
-    // Caller must hold the _pendingMutex lock.
-    bool inProgress(const std::string& name) const;
-
-    /** called from the worker thread */
-    static void run(FileAllocator* fa);
-
-    // generate a unique name for temporary files
-    std::string makeTempFileName(boost::filesystem::path root);
-
-    mutable stdx::mutex _pendingMutex;
-    mutable stdx::condition_variable _pendingUpdated;
-
-    std::list<std::string> _pending;
-    mutable std::map<std::string, long> _pendingSize;
-
-    // unique number for temporary files
-    static unsigned long long _uniqueNumber;
-
-    bool _failed;
-};
-
-}  // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp
deleted file mode 100644
index 093808ea9c8..00000000000
--- a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-// heap_record_store_btree.cpp
-
-/**
- *    Copyright (C) 2014 MongoDB Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the GNU Affero General Public License, version 3,
- *    as published by the Free Software Foundation.
- *
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *    GNU Affero General Public License for more details.
- *
- *    You should have received a copy of the GNU Affero General Public License
- *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library.
You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/heap_record_store_btree.h" - -#include "mongo/base/checked_cast.h" -#include "mongo/db/operation_context.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" - -namespace mongo { - -RecordData HeapRecordStoreBtree::dataFor(OperationContext* opCtx, const RecordId& loc) const { - Records::const_iterator it = _records.find(loc); - invariant(it != _records.end()); - const MmapV1RecordHeader& rec = it->second; - - return RecordData(rec.data.get(), rec.dataSize); -} - -bool HeapRecordStoreBtree::findRecord(OperationContext* opCtx, - const RecordId& loc, - RecordData* out) const { - Records::const_iterator it = _records.find(loc); - if (it == _records.end()) - return false; - const MmapV1RecordHeader& rec = it->second; - *out = RecordData(rec.data.get(), rec.dataSize); - return true; -} - -void HeapRecordStoreBtree::deleteRecord(OperationContext* opCtx, const RecordId& loc) { - invariant(_records.erase(loc) == 1); -} - -StatusWith<RecordId> HeapRecordStoreBtree::insertRecord( - OperationContext* opCtx, const char* data, int len, Timestamp, bool enforceQuota) { - MmapV1RecordHeader rec(len); - memcpy(rec.data.get(), data, len); - - const RecordId loc = allocateLoc(); - _records[loc] = rec; - - HeapRecordStoreBtreeRecoveryUnit::notifyInsert(opCtx, this, loc); - - return StatusWith<RecordId>(loc); -} - -Status HeapRecordStoreBtree::insertRecordsWithDocWriter(OperationContext* opCtx, - const DocWriter* const* docs, - const Timestamp*, - size_t nDocs, - RecordId* idsOut) { - // This class is only for unit tests of the mmapv1 btree code and this is how it is called. - // If that ever changes, this class will need to be fixed. - invariant(nDocs == 1); - invariant(idsOut); - - MmapV1RecordHeader rec(docs[0]->documentSize()); - docs[0]->writeDocument(rec.data.get()); - - const RecordId loc = allocateLoc(); - _records[loc] = rec; - *idsOut = loc; - - HeapRecordStoreBtreeRecoveryUnit::notifyInsert(opCtx, this, loc); - - return Status::OK(); -} - -RecordId HeapRecordStoreBtree::allocateLoc() { - const int64_t id = _nextId++; - // This is a hack, but both the high and low order bits of RecordId offset must be 0, and the - // file must fit in 23 bits. This gives us a total of 30 + 23 == 53 bits. 
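-    // As a worked example for the packing below: a hypothetical id of ((5 << 30) | 7)
-    // yields file = id >> 30 == 5 and offset = (id << 1) & ~(1 << 31) == 14,
-    // leaving the required low bit of the offset clear.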
-    invariant(id < (1LL << 53));
-    RecordId dl(int(id >> 30), int((id << 1) & ~(1 << 31)));
-    invariant((dl.repr() & 0x1) == 0);
-    return dl;
-}
-
-Status HeapRecordStoreBtree::touch(OperationContext* opCtx, BSONObjBuilder* output) const {
-    // not currently called from the tests, but called from btree_logic.h
-    return Status::OK();
-}
-
-// ---------------------------
-
-void HeapRecordStoreBtreeRecoveryUnit::commitUnitOfWork() {
-    _insertions.clear();
-    _mods.clear();
-}
-
-void HeapRecordStoreBtreeRecoveryUnit::abortUnitOfWork() {
-    // Roll back in reverse order, in case the same region was written twice.
-    for (size_t i = _mods.size(); i > 0; i--) {
-        ModEntry& e = _mods[i - 1];
-        memcpy(e.data, e.old.get(), e.len);
-    }
-
-    invariant(_insertions.size() == 0);  // todo
-}
-
-void* HeapRecordStoreBtreeRecoveryUnit::writingPtr(void* data, size_t len) {
-    ModEntry e = {data, len, boost::shared_array<char>(new char[len])};
-    memcpy(e.old.get(), data, len);
-    _mods.push_back(e);
-    return data;
-}
-
-void HeapRecordStoreBtreeRecoveryUnit::notifyInsert(HeapRecordStoreBtree* rs, const RecordId& loc) {
-    InsertEntry e = {rs, loc};
-    _insertions.push_back(e);
-}
-
-void HeapRecordStoreBtreeRecoveryUnit::notifyInsert(OperationContext* ctx,
-                                                    HeapRecordStoreBtree* rs,
-                                                    const RecordId& loc) {
-    if (!ctx)
-        return;
-
-    // The result of this dynamic_cast carries meaning: any other RecoveryUnit type is
-    // silently ignored. Ideally the interface would change so the cast isn't needed.
-    HeapRecordStoreBtreeRecoveryUnit* ru =
-        dynamic_cast<HeapRecordStoreBtreeRecoveryUnit*>(ctx->recoveryUnit());
-
-    if (!ru)
-        return;
-
-    ru->notifyInsert(rs, loc);
-}
-
-
-}  // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h b/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h
deleted file mode 100644
index 4095a910115..00000000000
--- a/src/mongo/db/storage/mmap_v1/heap_record_store_btree.h
+++ /dev/null
@@ -1,237 +0,0 @@
-// heap_record_store_btree.h
-
-/**
-* Copyright (C) 2014 MongoDB Inc.
-*
-* This program is free software: you can redistribute it and/or modify
-* it under the terms of the GNU Affero General Public License, version 3,
-* as published by the Free Software Foundation.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU Affero General Public License for more details.
-*
-* You should have received a copy of the GNU Affero General Public License
-* along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*
-* As a special exception, the copyright holders give permission to link the
-* code of portions of this program with the OpenSSL library under certain
-* conditions as described in each individual source file and distribute
-* linked combinations including the program with the OpenSSL library. You
-* must comply with the GNU Affero General Public License in all respects for
-* all of the code used other than as permitted herein. If you modify file(s)
-* with this exception, you may extend this exception to your version of the
-* file(s), but you are not obligated to do so. If you do not wish to do so,
-* delete this exception statement from your version. If you delete this
-* exception statement from all source files in the program, then also delete
-* it in the license file.
-*/
-
-#pragma once
-
-#include <boost/shared_array.hpp>
-#include <map>
-
-#include "mongo/db/storage/record_store.h"
-#include "mongo/db/storage/recovery_unit.h"
-
-namespace mongo {
-
-/**
- * A RecordStore that stores all data on the heap.
This implementation contains only the - * functionality necessary to test btree. - */ -class HeapRecordStoreBtree : public RecordStore { - struct MmapV1RecordHeader; - -public: - const std::string& getIdent() const override { - MONGO_UNREACHABLE; - } - - // RecordId(0,0) isn't valid for records. - explicit HeapRecordStoreBtree(StringData ns) : RecordStore(ns), _nextId(1) {} - - virtual RecordData dataFor(OperationContext* opCtx, const RecordId& loc) const; - - virtual bool findRecord(OperationContext* opCtx, const RecordId& loc, RecordData* out) const; - - virtual void deleteRecord(OperationContext* opCtx, const RecordId& dl); - - virtual StatusWith<RecordId> insertRecord( - OperationContext* opCtx, const char* data, int len, Timestamp, bool enforceQuota); - - virtual Status insertRecordsWithDocWriter(OperationContext* opCtx, - const DocWriter* const* docs, - const Timestamp*, - size_t nDocs, - RecordId* idsOut); - - virtual long long numRecords(OperationContext* opCtx) const { - return _records.size(); - } - - virtual Status touch(OperationContext* opCtx, BSONObjBuilder* output) const; - - // public methods below here are not necessary to test btree, and will crash when called. - - // ------------------------------ - - virtual Status updateRecord(OperationContext* opCtx, - const RecordId& oldLocation, - const char* data, - int len, - bool enforceQuota, - UpdateNotifier* notifier) { - MONGO_UNREACHABLE; - } - - virtual bool updateWithDamagesSupported() const { - return true; - } - - virtual StatusWith<RecordData> updateWithDamages(OperationContext* opCtx, - const RecordId& loc, - const RecordData& oldRec, - const char* damageSource, - const mutablebson::DamageVector& damages) { - MONGO_UNREACHABLE; - } - - std::unique_ptr<SeekableRecordCursor> getCursor(OperationContext* opCtx, - bool forward) const final { - MONGO_UNREACHABLE; - } - - - virtual Status truncate(OperationContext* opCtx) { - MONGO_UNREACHABLE; - } - - virtual void cappedTruncateAfter(OperationContext* opCtx, RecordId end, bool inclusive) { - MONGO_UNREACHABLE; - } - - virtual bool compactSupported() const { - MONGO_UNREACHABLE; - } - - virtual Status validate(OperationContext* opCtx, - ValidateCmdLevel level, - ValidateAdaptor* adaptor, - ValidateResults* results, - BSONObjBuilder* output) { - MONGO_UNREACHABLE; - } - - virtual void appendCustomStats(OperationContext* opCtx, - BSONObjBuilder* result, - double scale) const { - MONGO_UNREACHABLE; - } - - virtual void increaseStorageSize(OperationContext* opCtx, int size, bool enforceQuota) { - MONGO_UNREACHABLE; - } - - virtual int64_t storageSize(OperationContext* opCtx, - BSONObjBuilder* extraInfo = NULL, - int infoLevel = 0) const { - MONGO_UNREACHABLE; - } - - virtual long long dataSize(OperationContext* opCtx) const { - MONGO_UNREACHABLE; - } - - virtual MmapV1RecordHeader* recordFor(const RecordId& loc) const { - MONGO_UNREACHABLE; - } - - virtual bool isCapped() const { - MONGO_UNREACHABLE; - } - - virtual const char* name() const { - MONGO_UNREACHABLE; - } - - void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx) const override { - MONGO_UNREACHABLE; - } - - virtual void updateStatsAfterRepair(OperationContext* opCtx, - long long numRecords, - long long dataSize) { - MONGO_UNREACHABLE; - } - // more things that we actually care about below - -private: - struct MmapV1RecordHeader { - MmapV1RecordHeader() : dataSize(-1), data() {} - explicit MmapV1RecordHeader(int size) : dataSize(size), data(new char[size]) {} - - int dataSize; - 
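-        // Owned heap buffer holding the record's bytes; a shared_array keeps this
-        // header cheap to copy in and out of the Records map.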
boost::shared_array<char> data;
-    };
-
-    RecordId allocateLoc();
-
-    typedef std::map<RecordId, HeapRecordStoreBtree::MmapV1RecordHeader> Records;
-    Records _records;
-    int64_t _nextId;
-};
-
-/**
- * A RecoveryUnit for HeapRecordStoreBtree; it exists only for testing the btree.
- */
-class HeapRecordStoreBtreeRecoveryUnit : public RecoveryUnit {
-public:
-    void beginUnitOfWork(OperationContext* opCtx) final {}
-    void commitUnitOfWork() final;
-    void abortUnitOfWork() final;
-
-    virtual bool waitUntilDurable() {
-        return true;
-    }
-
-    virtual void abandonSnapshot() {}
-
-    virtual void registerChange(Change* change) {
-        change->commit(boost::none);
-        delete change;
-    }
-
-    virtual void* writingPtr(void* data, size_t len);
-
-    virtual void setRollbackWritesDisabled() {}
-
-    virtual SnapshotId getSnapshotId() const {
-        return SnapshotId();
-    }
-
-    virtual void setOrderedCommit(bool orderedCommit) {}
-
-    // -----------------------
-
-    void notifyInsert(HeapRecordStoreBtree* rs, const RecordId& loc);
-    static void notifyInsert(OperationContext* ctx, HeapRecordStoreBtree* rs, const RecordId& loc);
-
-private:
-    struct InsertEntry {
-        HeapRecordStoreBtree* rs;
-        RecordId loc;
-    };
-    std::vector<InsertEntry> _insertions;
-
-    struct ModEntry {
-        void* data;
-        size_t len;
-        boost::shared_array<char> old;
-    };
-    std::vector<ModEntry> _mods;
-};
-
-}  // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp b/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp
deleted file mode 100644
index 8807dfbb064..00000000000
--- a/src/mongo/db/storage/mmap_v1/journal_latency_test_cmd.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/**
- *    Copyright (C) 2012 10gen Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the GNU Affero General Public License, version 3,
- *    as published by the Free Software Foundation.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *    GNU Affero General Public License for more details.
- *
- *    You should have received a copy of the GNU Affero General Public License
- *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library. You
- *    must comply with the GNU Affero General Public License in all respects for
- *    all of the code used other than as permitted herein. If you modify file(s)
- *    with this exception, you may extend this exception to your version of the
- *    file(s), but you are not obligated to do so. If you do not wish to do so,
- *    delete this exception statement from your version. If you delete this
- *    exception statement from all source files in the program, then also delete
- *    it in the license file.
- */
-
-#include "mongo/platform/basic.h"
-
-#include <boost/filesystem/convenience.hpp>
-#include <boost/filesystem/operations.hpp>
-#include <string>
-#include <vector>
-
-#include "mongo/base/init.h"
-#include "mongo/base/status.h"
-#include "mongo/db/auth/action_set.h"
-#include "mongo/db/auth/action_type.h"
-#include "mongo/db/auth/privilege.h"
-#include "mongo/db/commands.h"
-#include "mongo/db/commands/test_commands_enabled.h"
-#include "mongo/db/curop.h"
-#include "mongo/db/index/index_access_method.h"
-#include "mongo/db/index/index_descriptor.h"
-#include "mongo/db/jsobj.h"
-#include "mongo/db/query/internal_plans.h"
-#include "mongo/db/storage/mmap_v1/aligned_builder.h"
-#include "mongo/db/storage/mmap_v1/logfile.h"
-#include "mongo/db/storage/mmap_v1/paths.h"
-#include "mongo/db/storage/storage_options.h"
-#include "mongo/scripting/engine.h"
-#include "mongo/util/background.h"
-#include "mongo/util/timer.h"
-
-namespace mongo {
-
-using std::max;
-using std::min;
-using std::string;
-using std::stringstream;
-
-namespace dur {
-boost::filesystem::path getJournalDir();
-}
-
-// Testing-only, enabled via command line
-class JournalLatencyTestCmd : public BasicCommand {
-public:
-    JournalLatencyTestCmd() : BasicCommand("journalLatencyTest") {}
-
-    AllowedOnSecondary secondaryAllowed(ServiceContext*) const override {
-        return AllowedOnSecondary::kAlways;
-    }
-    virtual bool supportsWriteConcern(const BSONObj& cmd) const override {
-        return false;
-    }
-    virtual bool adminOnly() const {
-        return true;
-    }
-    std::string help() const override {
-        return "test how long to write and fsync to a test file in the journal/ directory";
-    }
-    // No auth needed because it only works when enabled via command line.
-    virtual void addRequiredPrivileges(const std::string& dbname,
-                                       const BSONObj& cmdObj,
-                                       std::vector<Privilege>* out) const {}
-    bool run(OperationContext* opCtx,
-             const string& dbname,
-             const BSONObj& cmdObj,
-             BSONObjBuilder& result) {
-        boost::filesystem::path p = dur::getJournalDir();
-        p /= "journalLatencyTest";
-
-        // Remove the file if it is already present.
-        try {
-            boost::filesystem::remove(p);
-        } catch (...) {
-        }
-
-        BSONObjBuilder bb[2];
-        for (int pass = 0; pass < 2; pass++) {
-            LogFile f(p.string());
-            AlignedBuilder b(1024 * 1024);
-            {
-                Timer t;
-                for (int i = 0; i < 100; i++) {
-                    f.synchronousAppend(b.buf(), 8192);
-                }
-                bb[pass].append("8KB", t.millis() / 100.0);
-            }
-            {
-                const int N = 50;
-                Timer t2;
-                long long x = 0;
-                for (int i = 0; i < N; i++) {
-                    Timer t;
-                    f.synchronousAppend(b.buf(), 8192);
-                    x += t.micros();
-                    sleepmillis(4);
-                }
-                long long y = t2.micros() - 4 * N * 1000;
-                // We don't fully trust the timer granularity on all platforms, so report
-                // whichever of x and y is higher.
-                bb[pass].append("8KBWithPauses", max(x, y) / (N * 1000.0));
-            }
-            {
-                Timer t;
-                for (int i = 0; i < 20; i++) {
-                    f.synchronousAppend(b.buf(), 1024 * 1024);
-                }
-                bb[pass].append("1MB", t.millis() / 20.0);
-            }
-            // The second time around, the file is preallocated.
-        }
-        result.append("timeMillis", bb[0].obj());
-        result.append("timeMillisWithPrealloc", bb[1].obj());
-
-        try {
-            remove(p);
-        } catch (...) {
-        }
-
-        try {
-            result.append(
-                "onSamePartition",
-                onSamePartition(dur::getJournalDir().string(), storageGlobalParams.dbpath));
-        } catch (...)
{ - } - - return 1; - } -}; -MONGO_REGISTER_TEST_COMMAND(JournalLatencyTestCmd); -} diff --git a/src/mongo/db/storage/mmap_v1/logfile.cpp b/src/mongo/db/storage/mmap_v1/logfile.cpp deleted file mode 100644 index 98cfabc1f75..00000000000 --- a/src/mongo/db/storage/mmap_v1/logfile.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// @file logfile.cpp simple file log writing / journaling - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects -* for all of the code used other than as permitted herein. If you modify -* file(s) with this exception, you may extend this exception to your -* version of the file(s), but you are not obligated to do so. If you do not -* wish to do so, delete this exception statement from your version. If you -* delete this exception statement from all source files in the program, -* then also delete it in the license file. -*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/logfile.h" - -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/paths.h" -#include "mongo/platform/posix_fadvise.h" -#include "mongo/util/allocator.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/startup_test.h" -#include "mongo/util/text.h" - - -using namespace mongoutils; - -using std::endl; -using std::string; - -#if defined(_WIN32) - -namespace mongo { - -LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) { - _fd = CreateFile(toNativeString(name.c_str()).c_str(), - (readwrite ? 
GENERIC_READ : 0) | GENERIC_WRITE, - FILE_SHARE_READ, - NULL, - OPEN_ALWAYS, - FILE_FLAG_NO_BUFFERING, - NULL); - if (_fd == INVALID_HANDLE_VALUE) { - DWORD e = GetLastError(); - uasserted(13518, - str::stream() << "couldn't open file " << name << " for writing " - << errnoWithDescription(e)); - } - SetFilePointer(_fd, 0, 0, FILE_BEGIN); -} - -LogFile::~LogFile() { - if (_fd != INVALID_HANDLE_VALUE) - CloseHandle(_fd); -} - -void LogFile::truncate() { - verify(_fd != INVALID_HANDLE_VALUE); - - if (!SetEndOfFile(_fd)) { - msgasserted(15871, "Couldn't truncate file: " + errnoWithDescription()); - } -} - -void LogFile::writeAt(unsigned long long offset, const void* _buf, size_t _len) { - // TODO 64 bit offsets - OVERLAPPED o; - memset(&o, 0, sizeof(o)); - (unsigned long long&)o.Offset = offset; - BOOL ok = WriteFile(_fd, _buf, _len, 0, &o); - verify(ok); -} - -void LogFile::readAt(unsigned long long offset, void* _buf, size_t _len) { - // TODO 64 bit offsets - OVERLAPPED o; - memset(&o, 0, sizeof(o)); - (unsigned long long&)o.Offset = offset; - DWORD nr; - BOOL ok = ReadFile(_fd, _buf, _len, &nr, &o); - if (!ok) { - string e = errnoWithDescription(); - // DWORD e = GetLastError(); - log() << "LogFile readAt(" << offset << ") len:" << _len << "errno:" << e << endl; - verify(false); - } -} - -void LogFile::synchronousAppend(const void* _buf, size_t _len) { - const size_t BlockSize = 8 * 1024 * 1024; - verify(_fd); - verify(_len % minDirectIOSizeBytes == 0); - const char* buf = (const char*)_buf; - size_t left = _len; - while (left) { - size_t toWrite = std::min(left, BlockSize); - DWORD written; - if (!WriteFile(_fd, buf, toWrite, &written, NULL)) { - DWORD e = GetLastError(); - if (e == 87) - msgasserted(13519, "error 87 appending to file - invalid parameter"); - else - uasserted(13517, - str::stream() << "error appending to file " << _name << ' ' << _len << ' ' - << toWrite - << ' ' - << errnoWithDescription(e)); - } else { - dassert(written == toWrite); - } - left -= written; - buf += written; - } -} -} - -#else - -/// posix - -#include <fcntl.h> -#include <sys/ioctl.h> -#include <sys/stat.h> -#include <sys/types.h> - -#ifdef __linux__ -#include <linux/fs.h> -#endif - -namespace mongo { - -LogFile::LogFile(const std::string& name, bool readwrite) : _name(name) { - int options = O_CREAT | (readwrite ? O_RDWR : O_WRONLY) -#if defined(O_DIRECT) - | O_DIRECT -#endif -#if defined(O_NOATIME) - | O_NOATIME -#endif - ; - - _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR); - _blkSize = minDirectIOSizeBytes; - -#if defined(O_DIRECT) - _direct = true; - if (_fd < 0) { - _direct = false; - options &= ~O_DIRECT; - _fd = open(name.c_str(), options, S_IRUSR | S_IWUSR); - } -#ifdef __linux__ - ssize_t tmpBlkSize = ioctl(_fd, BLKBSZGET); - // TODO: We need some sanity checking on tmpBlkSize even if ioctl() did not fail. 
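-    // Without that check, a bogus block size would trip the alignment fasserts in
-    // synchronousAppend(), which test buffer addresses against _blkSize.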
- if (tmpBlkSize > 0) { - _blkSize = (size_t)tmpBlkSize; - } -#endif -#else - _direct = false; -#endif - - if (_fd < 0) { - uasserted(13516, - str::stream() << "couldn't open file " << name << " for writing " - << errnoWithDescription()); - } - - flushMyDirectory(name); -} - -LogFile::~LogFile() { - if (_fd >= 0) - close(_fd); - _fd = -1; -} - -void LogFile::truncate() { - verify(_fd >= 0); - - MONGO_STATIC_ASSERT(sizeof(off_t) == 8); // we don't want overflow here - const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek - if (ftruncate(_fd, pos) != 0) { - msgasserted(15873, "Couldn't truncate file: " + errnoWithDescription()); - } - - fsync(_fd); -} - -void LogFile::writeAt(unsigned long long offset, const void* buf, size_t len) { - verify(((size_t)buf) % minDirectIOSizeBytes == 0); // aligned - ssize_t written = pwrite(_fd, buf, len, offset); - if (written != (ssize_t)len) { - log() << "writeAt fails " << errnoWithDescription() << endl; - } -#if defined(__linux__) - fdatasync(_fd); -#else - fsync(_fd); -#endif -} - -void LogFile::readAt(unsigned long long offset, void* _buf, size_t _len) { - verify(((size_t)_buf) % minDirectIOSizeBytes == 0); // aligned - ssize_t rd = pread(_fd, _buf, _len, offset); - verify(rd != -1); -} - -void LogFile::synchronousAppend(const void* b, size_t len) { - const char* buf = static_cast<const char*>(b); - ssize_t charsToWrite = static_cast<ssize_t>(len); - - fassert(16144, charsToWrite >= 0); - fassert(16142, _fd >= 0); - fassert(16143, reinterpret_cast<size_t>(buf) % _blkSize == 0); // aligned - -#ifdef POSIX_FADV_DONTNEED - const off_t pos = lseek(_fd, 0, SEEK_CUR); // doesn't actually seek, just get current position -#endif - - while (charsToWrite > 0) { - const ssize_t written = write(_fd, buf, static_cast<size_t>(charsToWrite)); - if (-1 == written) { - log() << "LogFile::synchronousAppend failed with " << charsToWrite - << " bytes unwritten out of " << len << " bytes; b=" << b << ' ' - << errnoWithDescription() << std::endl; - fassertFailed(13515); - } - buf += written; - charsToWrite -= written; - } - - if ( -#if defined(__linux__) - fdatasync(_fd) < 0 -#else - fsync(_fd) -#endif - ) { - log() << "error appending to file on fsync " << ' ' << errnoWithDescription(); - fassertFailed(13514); - } - -#ifdef POSIX_FADV_DONTNEED - if (!_direct && pos >= 0) // current position cannot be negative - posix_fadvise(_fd, pos, len, POSIX_FADV_DONTNEED); -#endif -} -} - -#endif diff --git a/src/mongo/db/storage/mmap_v1/logfile.h b/src/mongo/db/storage/mmap_v1/logfile.h deleted file mode 100644 index dbb83cf2a2e..00000000000 --- a/src/mongo/db/storage/mmap_v1/logfile.h +++ /dev/null @@ -1,83 +0,0 @@ -// @file logfile.h simple file log writing / journaling - -/** -* Copyright (C) 2010 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. 
-*
-* As a special exception, the copyright holders give permission to link the
-* code of portions of this program with the OpenSSL library under certain
-* conditions as described in each individual source file and distribute
-* linked combinations including the program with the OpenSSL library. You
-* must comply with the GNU Affero General Public License in all respects
-* for all of the code used other than as permitted herein. If you modify
-* file(s) with this exception, you may extend this exception to your
-* version of the file(s), but you are not obligated to do so. If you do not
-* wish to do so, delete this exception statement from your version. If you
-* delete this exception statement from all source files in the program,
-* then also delete it in the license file.
-*/
-
-#pragma once
-
-#include <string>
-
-
-namespace mongo {
-
-class LogFile {
-public:
-    /** create the file and open. must not already exist.
-        throws UserAssertion on i/o error
-    */
-    LogFile(const std::string& name, bool readwrite = false);
-
-    /** closes */
-    ~LogFile();
-
-    /** append to file.  does not return until sync'd.  uses direct i/o when possible.
-        throws UserAssertion on an i/o error
-        note direct i/o may have alignment requirements
-    */
-    void synchronousAppend(const void* buf, size_t len);
-
-    /** write at the specified offset. must be aligned. does not return until
-        physically written. thread safe.
-    */
-    void writeAt(unsigned long long offset, const void* _buf, size_t _len);
-
-    void readAt(unsigned long long offset, void* _buf, size_t _len);
-
-    const std::string _name;
-
-    void truncate();  // Removes extra data after current position
-
-private:
-    // Disks originally had a sector size of 512 bytes; after Advanced Format disks
-    // were deployed in 2011, the default minimum size became 4096.
-    // The direct I/O size is based on the physical disk sector, not the VM page size.
-    const size_t minDirectIOSizeBytes = 4096;
-
-private:
-#if defined(_WIN32)
-    typedef HANDLE fd_type;
-#else
-    typedef int fd_type;
-#endif
-    fd_type _fd;
-    bool _direct;  // are we using direct I/O
-
-    // Block size: with direct I/O we must test alignment against this,
-    // and it can differ from 4kB.
-    size_t _blkSize;
-};
-}
diff --git a/src/mongo/db/storage/mmap_v1/mmap.cpp b/src/mongo/db/storage/mmap_v1/mmap.cpp
deleted file mode 100644
index f8d12295ce3..00000000000
--- a/src/mongo/db/storage/mmap_v1/mmap.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// mmap.cpp
-
-/*    Copyright 2009 10gen Inc.
- *
- *    This program is free software: you can redistribute it and/or modify
- *    it under the terms of the GNU Affero General Public License, version 3,
- *    as published by the Free Software Foundation.
- *
- *    This program is distributed in the hope that it will be useful,
- *    but WITHOUT ANY WARRANTY; without even the implied warranty of
- *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *    GNU Affero General Public License for more details.
- *
- *    You should have received a copy of the GNU Affero General Public License
- *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- *    As a special exception, the copyright holders give permission to link the
- *    code of portions of this program with the OpenSSL library under certain
- *    conditions as described in each individual source file and distribute
- *    linked combinations including the program with the OpenSSL library.
You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/mmap.h" - -#include <boost/filesystem/operations.hpp> - -#include "mongo/base/owned_pointer_vector.h" -#include "mongo/db/client.h" -#include "mongo/db/concurrency/locker.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/util/log.h" -#include "mongo/util/map_util.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/processinfo.h" -#include "mongo/util/progress_meter.h" -#include "mongo/util/startup_test.h" - -namespace mongo { - -using std::endl; -using std::map; -using std::set; -using std::string; -using std::stringstream; -using std::vector; - -void minOSPageSizeBytesTest(size_t minOSPageSizeBytes) { - fassert(16325, minOSPageSizeBytes > 0); - fassert(16326, minOSPageSizeBytes < 1000000); - // check to see if the page size is a power of 2 - fassert(16327, (minOSPageSizeBytes & (minOSPageSizeBytes - 1)) == 0); -} - -namespace { -set<MongoFile*> mmfiles; -map<string, MongoFile*> pathToFile; -mongo::AtomicUInt64 mmfNextId(0); -} // namespace - -MemoryMappedFile::MemoryMappedFile(OperationContext* opCtx, OptionSet options) - : MongoFile(options), _uniqueId(mmfNextId.fetchAndAdd(1)) { - created(opCtx); -} - -MemoryMappedFile::~MemoryMappedFile() { - invariant(isClosed()); - - auto opCtx = cc().getOperationContext(); - invariant(opCtx); - - LockMongoFilesShared lock(opCtx); - for (std::set<MongoFile*>::const_iterator it = mmfiles.begin(); it != mmfiles.end(); it++) { - invariant(*it != this); - } -} - -/*static*/ AtomicUInt64 MemoryMappedFile::totalMappedLength; - -void* MemoryMappedFile::create(OperationContext* opCtx, - const std::string& filename, - unsigned long long len, - bool zero) { - uassert(13468, - string("can't create file already exists ") + filename, - !boost::filesystem::exists(filename)); - void* p = map(opCtx, filename.c_str(), len); - fassert(16331, p); - if (zero) { - size_t sz = (size_t)len; - verify(len == sz); - memset(p, 0, sz); - } - return p; -} - -/*static*/ void MemoryMappedFile::updateLength(const char* filename, unsigned long long& length) { - if (!boost::filesystem::exists(filename)) - return; - // make sure we map full length if preexisting file. - boost::uintmax_t l = boost::filesystem::file_size(filename); - length = l; -} - -void* MemoryMappedFile::map(OperationContext* opCtx, const char* filename) { - unsigned long long l; - try { - l = boost::filesystem::file_size(filename); - } catch (boost::filesystem::filesystem_error& e) { - uasserted(15922, - mongoutils::str::stream() << "couldn't get file length when opening mapping " - << filename - << ' ' - << e.what()); - } - - void* ret = map(opCtx, filename, l); - fassert(16334, ret); - return ret; -} - -/* --- MongoFile ------------------------------------------------- - this is the administrative stuff -*/ - -MongoFile::MongoFile(OptionSet options) - : _options(storageGlobalParams.readOnly ? 
(options | READONLY) : options) {} - - -Lock::ResourceMutex LockMongoFilesShared::mmmutex("MMapMutex"); -unsigned LockMongoFilesShared::era = 99; // note this rolls over - -set<MongoFile*>& MongoFile::getAllFiles() { - return mmfiles; -} - -/* subclass must call in destructor (or at close). - removes this from pathToFile and other maps - safe to call more than once, albeit might be wasted work - ideal to call close to the close, if the close is well before object destruction -*/ -void MongoFile::destroyed(OperationContext* opCtx) { - LockMongoFilesShared::assertExclusivelyLocked(opCtx); - mmfiles.erase(this); - pathToFile.erase(filename()); -} - -/*static*/ -void MongoFile::closeAllFiles(OperationContext* opCtx, stringstream& message) { - static int closingAllFiles = 0; - if (closingAllFiles) { - message << "warning closingAllFiles=" << closingAllFiles << endl; - return; - } - ++closingAllFiles; - - LockMongoFilesExclusive lk(opCtx); - - ProgressMeter pm(mmfiles.size(), 2, 1, "files", "File Closing Progress"); - set<MongoFile*> temp = mmfiles; - for (set<MongoFile*>::iterator i = temp.begin(); i != temp.end(); i++) { - (*i)->close(opCtx); // close() now removes from mmfiles - pm.hit(); - } - message << "closeAllFiles() finished"; - --closingAllFiles; -} - -/*static*/ int MongoFile::flushAll(OperationContext* opCtx, bool sync) { - return _flushAll(opCtx, sync); -} - -/*static*/ int MongoFile::_flushAll(OperationContext* opCtx, bool sync) { - if (!sync) { - int num = 0; - LockMongoFilesShared lk(opCtx); - for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) { - num++; - MongoFile* mmf = *i; - if (!mmf) - continue; - - invariant(!mmf->isOptionSet(READONLY)); - mmf->flush(sync); - } - return num; - } - - // want to do it sync - - // get a thread-safe Flushable object for each file first in a single lock - // so that we can iterate and flush without doing any locking here - OwnedPointerVector<Flushable> thingsToFlushWrapper; - vector<Flushable*>& thingsToFlush = thingsToFlushWrapper.mutableVector(); - { - LockMongoFilesShared lk(opCtx); - for (set<MongoFile*>::iterator i = mmfiles.begin(); i != mmfiles.end(); i++) { - MongoFile* mmf = *i; - if (!mmf) - continue; - thingsToFlush.push_back(mmf->prepareFlush()); - } - } - - for (size_t i = 0; i < thingsToFlush.size(); i++) { - thingsToFlush[i]->flush(opCtx); - } - - return thingsToFlush.size(); -} - -void MongoFile::created(OperationContext* opCtx) { - // If we're a READONLY mapping, we don't want to ever flush. 
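-    // Read-only mappings are therefore never registered, which is what allows
-    // _flushAll() to invariant that no file it visits has READONLY set.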
- if (!isOptionSet(READONLY)) { - LockMongoFilesExclusive lk(opCtx); - mmfiles.insert(this); - } -} - -void MongoFile::setFilename(OperationContext* opCtx, const std::string& fn) { - LockMongoFilesExclusive lk(opCtx); - verify(_filename.empty()); - _filename = boost::filesystem::absolute(fn).generic_string(); - MongoFile*& ptf = pathToFile[_filename]; - massert(13617, "MongoFile : multiple opens of same filename", ptf == 0); - ptf = this; -} - -MongoFile* MongoFileFinder::findByPath(const std::string& path) const { - return mapFindWithDefault(pathToFile, - boost::filesystem::absolute(path).generic_string(), - static_cast<MongoFile*>(NULL)); -} - -void dataSyncFailedHandler() { - log() << "error syncing data to disk, probably a disk error"; - log() << " shutting down immediately to avoid corruption"; - fassertFailed(17346); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap.h b/src/mongo/db/storage/mmap_v1/mmap.h deleted file mode 100644 index 37a2e9e6fcd..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap.h +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. 
- */ - -#pragma once - -#include <set> -#include <sstream> -#include <vector> - -#include "mongo/base/disallow_copying.h" -#include "mongo/db/client.h" -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/operation_context.h" - -namespace mongo { - -#if !defined(_WIN32) -typedef int HANDLE; -#endif - -extern std::size_t getMinOSPageSizeBytes(); -void minOSPageSizeBytesTest(size_t minOSPageSizeBytes); // lame-o - -// call this if syncing data fails -void dataSyncFailedHandler(); - -class MAdvise { - MONGO_DISALLOW_COPYING(MAdvise); - -public: - enum Advice { Sequential = 1, Random = 2 }; - MAdvise(void* p, unsigned len, Advice a); - ~MAdvise(); // destructor resets the range to MADV_NORMAL -private: - void* _p; - unsigned _len; -}; - -// lock order: lock dbMutex before this if you lock both -class LockMongoFilesShared { - friend class LockMongoFilesExclusive; - static Lock::ResourceMutex mmmutex; - static unsigned era; - - Lock::SharedLock lk; - -public: - explicit LockMongoFilesShared(OperationContext* opCtx) : lk(opCtx->lockState(), mmmutex) { - // JS worker threads may not have cc() setup, as they work on behalf of other clients - dassert(opCtx == cc().getOperationContext() || !cc().getOperationContext()); - } - - static void assertExclusivelyLocked(OperationContext* opCtx) { - invariant(mmmutex.isExclusivelyLocked(opCtx->lockState())); - } - - static void assertAtLeastReadLocked(OperationContext* opCtx) { - invariant(mmmutex.isAtLeastReadLocked(opCtx->lockState())); - } - - /** era changes anytime memory maps come and go. thus you can use this as a cheap way to check - if nothing has changed since the last time you locked. Of course you must be shared locked - at the time of this call, otherwise someone could be in progress. - - This is used for yielding; see PageFaultException::touch(). - */ - static unsigned getEra() { - return era; - } -}; - -class LockMongoFilesExclusive { - Lock::ExclusiveLock lk; - -public: - explicit LockMongoFilesExclusive(OperationContext* opCtx) - : lk(opCtx->lockState(), LockMongoFilesShared::mmmutex) { - // JS worker threads may not have cc() setup, as they work on behalf of other clients - dassert(opCtx == cc().getOperationContext() || !cc().getOperationContext()); - LockMongoFilesShared::era++; - } -}; - -/* the administrative-ish stuff here */ -class MongoFile { - MONGO_DISALLOW_COPYING(MongoFile); - -public: - /** Flushable has to fail nicely if the underlying object gets killed */ - class Flushable { - public: - virtual ~Flushable() {} - virtual void flush(OperationContext* opCtx) = 0; - }; - - enum Options { - NONE = 0, - SEQUENTIAL = 1 << 0, // hint - e.g. FILE_FLAG_SEQUENTIAL_SCAN on windows. - READONLY = 1 << 1 // if true, writing to the mapped file will crash the process. - }; - - // Integral type used as a BitSet of Options. - using OptionSet = std::underlying_type<Options>::type; - - MongoFile(OptionSet options); - virtual ~MongoFile() = default; - - /** @param fun is called for each MongoFile. - called from within a mutex that MongoFile uses. so be careful not to deadlock. - */ - template <class F> - static void forEach(OperationContext* opCtx, F fun); - - /** - * note: you need to be in mmmutex when using this. forEach (above) handles that for you - * automatically. 
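- * The returned reference aliases the global registry itself, not a snapshot,
- * so it must not be retained after the mutex is released.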
- */ - static std::set<MongoFile*>& getAllFiles(); - - static int flushAll(OperationContext* opCtx, bool sync); // returns n flushed - static void closeAllFiles(OperationContext* opCtx, std::stringstream& message); - - virtual bool isDurableMappedFile() { - return false; - } - - std::string filename() const { - return _filename; - } - void setFilename(OperationContext* opCtx, const std::string& fn); - - virtual uint64_t getUniqueId() const = 0; - -private: - std::string _filename; - static int _flushAll(OperationContext* opCtx, bool sync); // returns n flushed - const OptionSet _options; - -protected: - /** - * Implementations may assume this is called from within `LockMongoFilesExclusive`. - */ - virtual void close(OperationContext* opCtx) = 0; - virtual void flush(bool sync) = 0; - /** - * returns a thread safe object that you can call flush on - * Flushable has to fail nicely if the underlying object gets killed - */ - virtual Flushable* prepareFlush() = 0; - - /** - * Returns true iff the file is closed. - */ - virtual bool isClosed() = 0; - - void created(OperationContext* opCtx); /* subclass must call after create */ - - /** - * Implementations may assume this is called from within `LockMongoFilesExclusive`. - * - * subclass must call in destructor (or at close). - * removes this from pathToFile and other maps - * safe to call more than once, albeit might be wasted work - * ideal to call close to the close, if the close is well before object destruction - */ - void destroyed(OperationContext* opCtx); - - virtual unsigned long long length() const = 0; - - bool isOptionSet(Options option) const { - return _options & option; - } -}; - -/** look up a MMF by filename. scoped mutex locking convention. - example: - MMFFinderByName finder; - DurableMappedFile *a = finder.find("file_name_a"); - DurableMappedFile *b = finder.find("file_name_b"); -*/ -class MongoFileFinder { - MONGO_DISALLOW_COPYING(MongoFileFinder); - -public: - MongoFileFinder(OperationContext* opCtx) : _lk(opCtx) {} - - /** @return The MongoFile object associated with the specified file name. If no file is open - with the specified name, returns null. - */ - MongoFile* findByPath(const std::string& path) const; - -private: - LockMongoFilesShared _lk; -}; - -class MemoryMappedFile : public MongoFile { -protected: - virtual void* viewForFlushing() { - if (views.size() == 0) - return 0; - verify(views.size() == 1); - return views[0]; - } - -public: - MemoryMappedFile(OperationContext* opCtx, OptionSet options = NONE); - - virtual ~MemoryMappedFile(); - - /** - * Callers must be holding a `LockMongoFilesExclusive`. - */ - virtual void close(OperationContext* opCtx); - - /** - * uasserts if file doesn't exist. fasserts on mmap error. - */ - void* map(OperationContext* opCtx, const char* filename); - - /** - * uasserts if file exists. fasserts on mmap error. - * @param zero fill file with zeros when true - */ - void* create(OperationContext* opCtx, - const std::string& filename, - unsigned long long len, - bool zero); - - void flush(bool sync); - - virtual bool isClosed(); - - virtual Flushable* prepareFlush(); - - long shortLength() const { - return (long)len; - } - unsigned long long length() const { - return len; - } - HANDLE getFd() const { - return fd; - } - - /** - * Creates a new view with the specified properties. Automatically cleaned up upon - * close/destruction of the MemoryMappedFile object. Returns nullptr on mmap error. 
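- * The view is a private (copy-on-write) mapping, so writes through it are not
- * propagated back to the underlying file.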
- */ - void* createPrivateMap(); - - virtual uint64_t getUniqueId() const { - return _uniqueId; - } - - static int totalMappedLengthInMB() { - return static_cast<int>(totalMappedLength.load() / 1024 / 1024); - } - -private: - static void updateLength(const char* filename, unsigned long long& length); - - HANDLE fd = 0; - HANDLE maphandle = 0; - std::vector<void*> views; - unsigned long long len = 0u; - static AtomicUInt64 totalMappedLength; - const uint64_t _uniqueId; -#ifdef _WIN32 - // flush Mutex - // - // Protects: - // Prevent flush() and close() from concurrently running. - // It ensures close() cannot complete while flush() is running - // Lock Ordering: - // LockMongoFilesShared must be taken before _flushMutex if both are taken - stdx::mutex _flushMutex; -#endif - -protected: - /** - * Creates with length if DNE, otherwise validates input length. Returns nullptr on mmap - * error. - */ - void* map(OperationContext* opCtx, const char* filename, unsigned long long& length); - - /** - * Close the current private view and open a new replacement. Returns nullptr on mmap error. - */ - void* remapPrivateView(OperationContext* opCtx, void* oldPrivateAddr); -}; - -/** p is called from within a mutex that MongoFile uses. so be careful not to deadlock. */ -template <class F> -inline void MongoFile::forEach(OperationContext* opCtx, F p) { - LockMongoFilesShared lklk(opCtx); - const std::set<MongoFile*>& mmfiles = MongoFile::getAllFiles(); - for (std::set<MongoFile*>::const_iterator i = mmfiles.begin(); i != mmfiles.end(); i++) - p(*i); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_posix.cpp b/src/mongo/db/storage/mmap_v1/mmap_posix.cpp deleted file mode 100644 index b4f96412d9a..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_posix.cpp +++ /dev/null @@ -1,333 +0,0 @@ -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl - -#include "mongo/platform/basic.h" - -#include <errno.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> - -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/platform/atomic_word.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/processinfo.h" -#include "mongo/util/startup_test.h" - -using std::endl; -using std::numeric_limits; -using std::vector; - -using namespace mongoutils; - -namespace mongo { - -namespace { -void printMemInfo() { - LogstreamBuilder out = log(); - out << "mem info: "; - - ProcessInfo pi; - if (!pi.supported()) { - out << " not supported"; - return; - } - - out << "vsize: " << pi.getVirtualMemorySize() << " resident: " << pi.getResidentSize() - << " mapped: " << MemoryMappedFile::totalMappedLengthInMB(); -} -} // namespace -} // namespace mongo - -std::size_t mongo::getMinOSPageSizeBytes() { - static const std::size_t cachedSize = [] { - std::size_t minOSPageSizeBytes = sysconf(_SC_PAGESIZE); - minOSPageSizeBytesTest(minOSPageSizeBytes); - return minOSPageSizeBytes; - }(); - return cachedSize; -} - -namespace mongo { - -void MemoryMappedFile::close(OperationContext* opCtx) { - for (vector<void*>::iterator i = views.begin(); i != views.end(); i++) { - munmap(*i, len); - } - views.clear(); - totalMappedLength.fetchAndSubtract(len); - len = 0; - - if (fd) { - ::close(fd); - fd = 0; - } - destroyed(opCtx); // cleans up from the master list of mmaps -} - -#ifndef O_NOATIME -#define O_NOATIME (0) -#endif - -#ifndef MAP_NORESERVE -#define MAP_NORESERVE (0) -#endif - -namespace { -void* _pageAlign(void* p) { - return (void*)((int64_t)p & ~(getMinOSPageSizeBytes() - 1)); -} - -class PageAlignTest : public StartupTest { -public: - void run() { - { - int64_t x = getMinOSPageSizeBytes() + 123; - void* y = _pageAlign(reinterpret_cast<void*>(x)); - invariant(getMinOSPageSizeBytes() == reinterpret_cast<size_t>(y)); - } - { - int64_t a = static_cast<uint64_t>(numeric_limits<int>::max()); - a = a / getMinOSPageSizeBytes(); - a = a * getMinOSPageSizeBytes(); - // a should now be page aligned - - // b is not page aligned - int64_t b = a + 123; - - void* y = _pageAlign(reinterpret_cast<void*>(b)); - invariant(a == reinterpret_cast<int64_t>(y)); - } - } -} pageAlignTest; -} - -#if defined(__sun) -MAdvise::MAdvise(void*, unsigned, Advice) {} -MAdvise::~MAdvise() {} -#else -MAdvise::MAdvise(void* p, unsigned len, Advice a) { - _p = _pageAlign(p); - - _len = len + static_cast<unsigned>(reinterpret_cast<size_t>(p) - reinterpret_cast<size_t>(_p)); - - int advice = 0; - switch (a) { - case Sequential: - advice = MADV_SEQUENTIAL; - break; - case Random: - advice = MADV_RANDOM; - break; - } - - if (madvise(_p, _len, advice)) { - error() << "madvise failed: " << errnoWithDescription(); - } -} -MAdvise::~MAdvise() { - madvise(_p, _len, MADV_NORMAL); -} -#endif - -void* MemoryMappedFile::map(OperationContext* opCtx, - const char* filename, - unsigned long long& length) { - // length may be updated by callee. - setFilename(opCtx, filename); - FileAllocator::get()->allocateAsap(filename, length); - - const bool readOnly = isOptionSet(READONLY); - - massert( - 10446, str::stream() << "mmap: can't map area of size 0 file: " << filename, length > 0); - - const int posixOpenOpts = O_NOATIME | (readOnly ? 
O_RDONLY : O_RDWR); - fd = ::open(filename, posixOpenOpts); - if (fd <= 0) { - severe() << "couldn't open " << filename << ' ' << errnoWithDescription() << endl; - fd = 0; // our sentinel for not opened - return 0; - } - - unsigned long long filelen = lseek(fd, 0, SEEK_END); - if (filelen != length) { - severe() << "map file alloc failed, wanted: " << length << " filelen: " << filelen << ' ' - << sizeof(size_t); - fassertFailed(16330); - } - lseek(fd, 0, SEEK_SET); - - const int mmapProtectionOpts = readOnly ? PROT_READ : (PROT_READ | PROT_WRITE); - void* view = mmap(NULL, length, mmapProtectionOpts, MAP_SHARED, fd, 0); - if (view == MAP_FAILED) { - severe() << " mmap() failed for " << filename << " len:" << length << " " - << errnoWithDescription() << endl; - if (errno == ENOMEM) { - if (sizeof(void*) == 4) - severe() << "mmap failed with out of memory. You are using a 32-bit build and " - "probably need to upgrade to 64" - << endl; - else - severe() << "mmap failed with out of memory. (64 bit build)" << endl; - } - return 0; - } - - -#if !defined(__sun) - if (isOptionSet(SEQUENTIAL)) { - if (madvise(view, length, MADV_SEQUENTIAL)) { - warning() << "map: madvise failed for " << filename << ' ' << errnoWithDescription() - << endl; - } - } -#endif - - // MemoryMappedFile successfully created, now update state. - len = length; - MemoryMappedFile::totalMappedLength.fetchAndAdd(len); - - views.push_back(view); - - return view; -} - -void* MemoryMappedFile::createPrivateMap() { - void* x = mmap(/*start*/ 0, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE, fd, 0); - if (x == MAP_FAILED) { - if (errno == ENOMEM) { - if (sizeof(void*) == 4) { - severe() << "mmap private failed with out of memory. You are using a 32-bit build " - "and probably need to upgrade to 64" - << endl; - } else { - severe() << "mmap private failed with out of memory. (64 bit build)" << endl; - } - } else { - severe() << "mmap private failed " << errnoWithDescription() << endl; - } - return 0; - } - - views.push_back(x); - return x; -} - -void* MemoryMappedFile::remapPrivateView(OperationContext* opCtx, void* oldPrivateAddr) { -#if defined(__sun) // SERVER-8795 - LockMongoFilesExclusive lockMongoFiles(opCtx); -#endif - - // don't unmap, just mmap over the old region - void* x = mmap(oldPrivateAddr, - len, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_NORESERVE | MAP_FIXED, - fd, - 0); - if (x == MAP_FAILED) { - int err = errno; - severe() << "13601 Couldn't remap private view: " << errnoWithDescription(err) << endl; - printMemInfo(); - abort(); - } - verify(x == oldPrivateAddr); - return x; -} - -void MemoryMappedFile::flush(bool sync) { - if (views.empty() || fd == 0 || !sync) - return; - - bool useFsync = !ProcessInfo::preferMsyncOverFSync(); - - if (useFsync ? fsync(fd) != 0 : msync(viewForFlushing(), len, MS_SYNC) != 0) { - // msync failed, this is very bad - log() << (useFsync ? "fsync failed: " : "msync failed: ") << errnoWithDescription() - << " file: " << filename() << endl; - dataSyncFailedHandler(); - } -} - -bool MemoryMappedFile::isClosed() { - return !len && !fd && !views.size(); -} - -class PosixFlushable : public MemoryMappedFile::Flushable { -public: - PosixFlushable(MemoryMappedFile* theFile, void* view, HANDLE fd, long len) - : _theFile(theFile), _view(view), _fd(fd), _len(len), _id(_theFile->getUniqueId()) {} - - void flush(OperationContext* opCtx) { - if (_view == NULL || _fd == 0) - return; - - if (ProcessInfo::preferMsyncOverFSync() ? 
msync(_view, _len, MS_SYNC) == 0 - : fsync(_fd) == 0) { - return; - } - - if (errno == EBADF) { - // ok, we were unlocked, so this file was closed - return; - } - - // some error, lets see if we're supposed to exist - LockMongoFilesShared mmfilesLock(opCtx); - std::set<MongoFile*> mmfs = MongoFile::getAllFiles(); - std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile); - if ((it == mmfs.end()) || ((*it)->getUniqueId() != _id)) { - log() << "msync failed with: " << errnoWithDescription() - << " but file doesn't exist anymore, so ignoring"; - // this was deleted while we were unlocked - return; - } - - // we got an error, and we still exist, so this is bad, we fail - log() << "msync " << errnoWithDescription() << endl; - dataSyncFailedHandler(); - } - - MemoryMappedFile* _theFile; - void* _view; - HANDLE _fd; - long _len; - const uint64_t _id; -}; - -MemoryMappedFile::Flushable* MemoryMappedFile::prepareFlush() { - return new PosixFlushable(this, viewForFlushing(), fd, len); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp deleted file mode 100644 index 369681a8298..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.cpp +++ /dev/null @@ -1,915 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */
-
-#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage
-
-#include "mongo/platform/basic.h"
-
-#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h"
-
-#include <utility>
-
-#include "mongo/db/catalog/database.h"
-#include "mongo/db/catalog/database_holder.h"
-#include "mongo/db/catalog/index_catalog_entry.h"
-#include "mongo/db/index/2d_access_method.h"
-#include "mongo/db/index/btree_access_method.h"
-#include "mongo/db/index/fts_access_method.h"
-#include "mongo/db/index/hash_access_method.h"
-#include "mongo/db/index/haystack_access_method.h"
-#include "mongo/db/index/index_access_method.h"
-#include "mongo/db/index/s2_access_method.h"
-#include "mongo/db/operation_context.h"
-#include "mongo/db/record_id.h"
-#include "mongo/db/server_parameters.h"
-#include "mongo/db/storage/mmap_v1/btree/btree_interface.h"
-#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h"
-#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h"
-#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h"
-#include "mongo/db/storage/mmap_v1/data_file.h"
-#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h"
-#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
-#include "mongo/db/storage/record_data.h"
-#include "mongo/util/log.h"
-#include "mongo/util/scopeguard.h"
-
-namespace mongo {
-
-using std::unique_ptr;
-
-namespace {
-
-/**
- * Declaration for the "newCollectionsUsePowerOf2Sizes" server parameter, which is now
- * deprecated in 3.0.
- * Note that:
- * - setting to true performs a no-op.
- * - setting to false will fail.
- */
-// Unused, needed for server parameter.
-AtomicBool newCollectionsUsePowerOf2SizesFlag(true);
-
-class NewCollectionsUsePowerOf2SizesParameter
-    : public ExportedServerParameter<bool, ServerParameterType::kStartupAndRuntime> {
-public:
-    NewCollectionsUsePowerOf2SizesParameter()
-        : ExportedServerParameter<bool, ServerParameterType::kStartupAndRuntime>(
-              ServerParameterSet::getGlobal(),
-              "newCollectionsUsePowerOf2Sizes",
-              &newCollectionsUsePowerOf2SizesFlag) {}
-
-    virtual Status validate(const bool& potentialNewValue) {
-        if (!potentialNewValue) {
-            return Status(ErrorCodes::BadValue,
-                          "newCollectionsUsePowerOf2Sizes cannot be set to false. "
-                          "Use noPadding instead during createCollection.");
-        }
-
-        return Status::OK();
-    }
-
-private:
-} exportedNewCollectionsUsePowerOf2SizesParameter;
-
-
-int _massageExtentSize(const ExtentManager* em, long long size) {
-    if (size < em->minSize())
-        return em->minSize();
-    if (size > em->maxSize())
-        return em->maxSize();
-
-    return static_cast<int>(size);
-}
-
-}  // namespace
-
-
-/**
- * Registers the insertion of a new entry in the _collections cache with the RecoveryUnit,
- * allowing for rollback.
- */
-class MMAPV1DatabaseCatalogEntry::EntryInsertion : public RecoveryUnit::Change {
-public:
-    EntryInsertion(StringData ns, MMAPV1DatabaseCatalogEntry* entry)
-        : _ns(ns.toString()), _entry(entry) {}
-
-    void rollback() {
-        _entry->_removeFromCache(NULL, _ns);
-    }
-
-    void commit(boost::optional<Timestamp>) {}
-
-private:
-    const std::string _ns;
-    MMAPV1DatabaseCatalogEntry* const _entry;
-};
-
-/**
- * Registers the removal of an entry from the _collections cache with the RecoveryUnit,
- * delaying actual deletion of the information until the change is committed. This allows
- * for easy rollback.
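 *
 * Registration sketch (mirrors the call in _removeFromCache below; `ru` is the
 * operation's RecoveryUnit and `cachedEntry` the Entry being dropped):
 *
 *     ru->registerChange(new EntryRemoval(ns, this, cachedEntry));
 *
 * rollback() then puts the entry back into _collections, while commit() finally
 * deletes it.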
- */ -class MMAPV1DatabaseCatalogEntry::EntryRemoval : public RecoveryUnit::Change { -public: - // Rollback removing the collection from the cache. Takes ownership of the cachedEntry, - // and will delete it if removal is final. - EntryRemoval(StringData ns, MMAPV1DatabaseCatalogEntry* catalogEntry, Entry* cachedEntry) - : _ns(ns.toString()), _catalogEntry(catalogEntry), _cachedEntry(cachedEntry) {} - - void rollback() { - _catalogEntry->_collections[_ns] = _cachedEntry; - } - - void commit(boost::optional<Timestamp>) { - delete _cachedEntry; - } - -private: - const std::string _ns; - MMAPV1DatabaseCatalogEntry* const _catalogEntry; - Entry* const _cachedEntry; -}; - -MMAPV1DatabaseCatalogEntry::MMAPV1DatabaseCatalogEntry(OperationContext* opCtx, - StringData name, - StringData path, - bool directoryPerDB, - bool transient, - std::unique_ptr<ExtentManager> extentManager) - : DatabaseCatalogEntry(name), - _path(path.toString()), - _namespaceIndex(opCtx, _path, name.toString()), - _extentManager(std::move(extentManager)) { - ScopeGuard onErrorClose = MakeGuard([&] { - _namespaceIndex.close(opCtx); - _extentManager->close(opCtx); - }); - massert(34469, - str::stream() << name << " is not a valid database name", - NamespaceString::validDBName(name)); - invariant(opCtx->lockState()->isDbLockedForMode(name, MODE_X)); - - try { - // First init the .ns file. If this fails, we may leak the .ns file, but this is OK - // because subsequent openDB will go through this code path again. - _namespaceIndex.init(opCtx); - - // Initialize the extent manager. This will create the first data file (.0) if needed - // and if this fails we would leak the .ns file above. Leaking the .ns or .0 file is - // acceptable, because subsequent openDB calls will exercise the code path again. - Status s = _extentManager->init(opCtx); - if (!s.isOK()) { - msgasserted(16966, str::stream() << "_extentManager->init failed: " << s.toString()); - } - - // This is the actual loading of the on-disk structures into cache. 
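        // (_init below walks the system.namespaces record store to populate the
        // _collections cache; if it throws, the catch blocks log and rethrow, and
        // onErrorClose closes the namespace index and extent manager.)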
- _init(opCtx); - } catch (const DBException& dbe) { - warning() << "database " << path << " " << name - << " could not be opened due to DBException " << dbe.code() << ": " << dbe.what(); - throw; - } catch (const std::exception& e) { - warning() << "database " << path << " " << name << " could not be opened " << e.what(); - throw; - } - - onErrorClose.Dismiss(); -} - -MMAPV1DatabaseCatalogEntry::~MMAPV1DatabaseCatalogEntry() { - for (CollectionMap::const_iterator i = _collections.begin(); i != _collections.end(); ++i) { - delete i->second; - } - _collections.clear(); -} - -intmax_t dbSize(const std::string& database); // from repair_database.cpp - -int64_t MMAPV1DatabaseCatalogEntry::sizeOnDisk(OperationContext* opCtx) const { - return static_cast<int64_t>(dbSize(name())); -} - -void MMAPV1DatabaseCatalogEntry::_removeFromCache(RecoveryUnit* ru, StringData ns) { - CollectionMap::iterator i = _collections.find(ns.toString()); - if (i == _collections.end()) { - return; - } - - // If there is an operation context, register a rollback to restore the cache entry - if (ru) { - ru->registerChange(new EntryRemoval(ns, this, i->second)); - } else { - delete i->second; - } - _collections.erase(i); -} - -Status MMAPV1DatabaseCatalogEntry::dropCollection(OperationContext* opCtx, StringData ns) { - invariant(opCtx->lockState()->isCollectionLockedForMode(ns, MODE_X)); - - NamespaceDetails* details = _namespaceIndex.details(ns); - - if (!details) { - return Status(ErrorCodes::NamespaceNotFound, str::stream() << "ns not found: " << ns); - } - - invariant(details->nIndexes == 0); // TODO: delete instead? - invariant(details->indexBuildsInProgress == 0); // TODO: delete instead? - - _removeNamespaceFromNamespaceCollection(opCtx, ns); - _removeFromCache(opCtx->recoveryUnit(), ns); - - // free extents - if (!details->firstExtent.isNull()) { - _extentManager->freeExtents(opCtx, details->firstExtent, details->lastExtent); - *opCtx->recoveryUnit()->writing(&details->firstExtent) = DiskLoc().setInvalid(); - *opCtx->recoveryUnit()->writing(&details->lastExtent) = DiskLoc().setInvalid(); - } - - // remove from the catalog hashtable - _namespaceIndex.kill_ns(opCtx, ns); - - return Status::OK(); -} - - -Status MMAPV1DatabaseCatalogEntry::renameCollection(OperationContext* opCtx, - StringData fromNS, - StringData toNS, - bool stayTemp) { - Status s = _renameSingleNamespace(opCtx, fromNS, toNS, stayTemp); - if (!s.isOK()) - return s; - - NamespaceDetails* details = _namespaceIndex.details(toNS); - invariant(details); - - RecordStoreV1Base* systemIndexRecordStore = _getIndexRecordStore(); - auto cursor = systemIndexRecordStore->getCursor(opCtx); - while (auto record = cursor->next()) { - BSONObj oldIndexSpec = record->data.releaseToBson(); - if (fromNS != oldIndexSpec["ns"].valuestrsafe()) - continue; - - BSONObj newIndexSpec; - { - BSONObjBuilder b; - BSONObjIterator i(oldIndexSpec); - while (i.more()) { - BSONElement e = i.next(); - if (strcmp(e.fieldName(), "ns") != 0) - b.append(e); - else - b << "ns" << toNS; - } - newIndexSpec = b.obj(); - } - // TODO SERVER-30638: using timestamp 0 for these inserts. - StatusWith<RecordId> newIndexSpecLoc = systemIndexRecordStore->insertRecord( - opCtx, newIndexSpec.objdata(), newIndexSpec.objsize(), Timestamp(), false); - if (!newIndexSpecLoc.isOK()) - return newIndexSpecLoc.getStatus(); - - const std::string& indexName = oldIndexSpec.getStringField("name"); - - { - // Fix the IndexDetails pointer. 
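            // (That is: repoint IndexDetails::info at the freshly inserted
            // system.indexes record, through the recovery unit's writing() so the
            // change participates in journaling and rollback.)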
- int indexI = getCollectionCatalogEntry(toNS)->_findIndexNumber(opCtx, indexName); - - IndexDetails& indexDetails = details->idx(indexI); - *opCtx->recoveryUnit()->writing(&indexDetails.info) = - DiskLoc::fromRecordId(newIndexSpecLoc.getValue()); - } - - { - // Move the underlying namespace. - std::string oldIndexNs = IndexDescriptor::makeIndexNamespace(fromNS, indexName); - std::string newIndexNs = IndexDescriptor::makeIndexNamespace(toNS, indexName); - - Status s = _renameSingleNamespace(opCtx, oldIndexNs, newIndexNs, false); - if (!s.isOK()) - return s; - } - // Invalidate index record for the old collection. - invalidateSystemCollectionRecord( - opCtx, NamespaceString(name(), "system.indexes"), record->id); - - systemIndexRecordStore->deleteRecord(opCtx, record->id); - } - - return Status::OK(); -} - -Status MMAPV1DatabaseCatalogEntry::_renameSingleNamespace(OperationContext* opCtx, - StringData fromNS, - StringData toNS, - bool stayTemp) { - // some sanity checking - NamespaceDetails* fromDetails = _namespaceIndex.details(fromNS); - if (!fromDetails) - return Status(ErrorCodes::NamespaceNotFound, "from namespace doesn't exist"); - - if (_namespaceIndex.details(toNS)) - return Status(ErrorCodes::NamespaceExists, "to namespace already exists"); - - // at this point, we haven't done anything destructive yet - - // ---- - // actually start moving - // ---- - - // this could throw, but if it does we're ok - _namespaceIndex.add_ns(opCtx, toNS, fromDetails); - NamespaceDetails* toDetails = _namespaceIndex.details(toNS); - - try { - toDetails->copyingFrom(opCtx, toNS, _namespaceIndex, fromDetails); // fixes extraOffset - } catch (DBException&) { - // could end up here if .ns is full - if so try to clean up / roll back a little - _namespaceIndex.kill_ns(opCtx, toNS); - throw; - } - - // at this point, code .ns stuff moved - - _namespaceIndex.kill_ns(opCtx, fromNS); - fromDetails = NULL; - - // fix system.namespaces - BSONObj newSpec; - RecordId oldSpecLocation = getCollectionCatalogEntry(fromNS)->getNamespacesRecordId(); - invariant(!oldSpecLocation.isNull()); - { - BSONObj oldSpec = - _getNamespaceRecordStore()->dataFor(opCtx, oldSpecLocation).releaseToBson(); - invariant(!oldSpec.isEmpty()); - - BSONObjBuilder b; - BSONObjIterator i(oldSpec.getObjectField("options")); - while (i.more()) { - BSONElement e = i.next(); - if (strcmp(e.fieldName(), "create") != 0) { - if (stayTemp || (strcmp(e.fieldName(), "temp") != 0)) - b.append(e); - } else { - b << "create" << toNS; - } - } - newSpec = b.obj(); - } - - RecordId rid = - _addNamespaceToNamespaceCollection(opCtx, toNS, newSpec.isEmpty() ? 0 : &newSpec); - - // Invalidate old namespace record - invalidateSystemCollectionRecord( - opCtx, NamespaceString(name(), "system.namespaces"), oldSpecLocation); - - _getNamespaceRecordStore()->deleteRecord(opCtx, oldSpecLocation); - - Entry*& entry = _collections[toNS.toString()]; - invariant(entry == NULL); - opCtx->recoveryUnit()->registerChange(new EntryInsertion(toNS, this)); - entry = new Entry(); - _removeFromCache(opCtx->recoveryUnit(), fromNS); - _insertInCache(opCtx, toNS, rid, entry); - - return Status::OK(); -} - -void MMAPV1DatabaseCatalogEntry::invalidateSystemCollectionRecord( - OperationContext* opCtx, NamespaceString systemCollectionNamespace, RecordId record) { - // Having to go back up through the DatabaseHolder is a bit of a layering - // violation, but at this point we're not going to add more MMAPv1 specific interfaces. 
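    // (What follows: resolve the Database through the DatabaseHolder under the DB
    // X lock, then tell the collection's CursorManager that the record is about to
    // be deleted so no open cursor is left pointing at it.)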
- StringData dbName = systemCollectionNamespace.db(); - invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X)); - Database* db = DatabaseHolder::getDatabaseHolder().get(opCtx, dbName); - Collection* systemCollection = db->getCollection(opCtx, systemCollectionNamespace); - systemCollection->getCursorManager()->invalidateDocument(opCtx, record, INVALIDATION_DELETION); -} - -void MMAPV1DatabaseCatalogEntry::appendExtraStats(OperationContext* opCtx, - BSONObjBuilder* output, - double scale) const { - if (isEmpty()) { - output->appendNumber("fileSize", 0); - } else { - output->appendNumber("fileSize", _extentManager->fileSize() / scale); - output->appendNumber("nsSizeMB", - static_cast<int>(_namespaceIndex.fileLength() / (1024 * 1024))); - - int freeListSize = 0; - int64_t freeListSpace = 0; - _extentManager->freeListStats(opCtx, &freeListSize, &freeListSpace); - - BSONObjBuilder extentFreeList(output->subobjStart("extentFreeList")); - extentFreeList.append("num", freeListSize); - extentFreeList.appendNumber("totalSize", static_cast<long long>(freeListSpace / scale)); - extentFreeList.done(); - - { - const DataFileVersion version = _extentManager->getFileFormat(opCtx); - - BSONObjBuilder dataFileVersion(output->subobjStart("dataFileVersion")); - dataFileVersion.append("major", version.majorRaw()); - dataFileVersion.append("minor", version.minorRaw()); - dataFileVersion.done(); - } - } -} - -bool MMAPV1DatabaseCatalogEntry::isOlderThan24(OperationContext* opCtx) const { - if (_extentManager->numFiles() == 0) - return false; - - const DataFileVersion version = _extentManager->getFileFormat(opCtx); - fassert(40109, version.isCompatibleWithCurrentCode()); - - return !version.is24IndexClean(); -} - -void MMAPV1DatabaseCatalogEntry::markIndexSafe24AndUp(OperationContext* opCtx) { - if (_extentManager->numFiles() == 0) - return; - - DataFileVersion version = _extentManager->getFileFormat(opCtx); - fassert(40110, version.isCompatibleWithCurrentCode()); - - if (version.is24IndexClean()) - return; // nothing to do - - version.setIs24IndexClean(); - _extentManager->setFileFormat(opCtx, version); -} - -void MMAPV1DatabaseCatalogEntry::markCollationFeatureAsInUse(OperationContext* opCtx) { - if (_extentManager->numFiles() == 0) - return; - - DataFileVersion version = _extentManager->getFileFormat(opCtx); - fassert(40150, version.isCompatibleWithCurrentCode()); - - if (version.getMayHaveCollationMetadata()) - return; - - version.setMayHaveCollationMetadata(); - _extentManager->setFileFormat(opCtx, version); -} - -Status MMAPV1DatabaseCatalogEntry::currentFilesCompatible(OperationContext* opCtx) const { - if (_extentManager->numFiles() == 0) - return Status::OK(); - - return _extentManager->getOpenFile(0)->getHeader()->version.isCompatibleWithCurrentCode(); -} - -void MMAPV1DatabaseCatalogEntry::getCollectionNamespaces(std::list<std::string>* tofill) const { - _namespaceIndex.getCollectionNamespaces(tofill); -} - -void MMAPV1DatabaseCatalogEntry::_ensureSystemCollection(OperationContext* opCtx, StringData ns) { - NamespaceDetails* details = _namespaceIndex.details(ns); - if (details) { - return; - } - - if (storageGlobalParams.readOnly) { - severe() << "Missing system collection '" << ns << "' for database '" << name() << "'"; - fassertFailed(34372); - } - - _namespaceIndex.add_ns(opCtx, ns, DiskLoc(), false); -} - -void MMAPV1DatabaseCatalogEntry::_init(OperationContext* opCtx) { - // We wrap the WUOW in an optional as we can't create it if we are in RO mode. 
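    // (Pattern: wunit.emplace(opCtx) only when the server is writable; the
    // matching wunit->commit() at the end of _init sits under the same
    // !storageGlobalParams.readOnly check.)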
- boost::optional<WriteUnitOfWork> wunit; - if (!storageGlobalParams.readOnly) { - wunit.emplace(opCtx); - } - - // Upgrade freelist - const NamespaceString oldFreeList(name(), "$freelist"); - NamespaceDetails* freeListDetails = _namespaceIndex.details(oldFreeList.ns()); - if (freeListDetails) { - if (storageGlobalParams.readOnly) { - severe() << "Legacy storage format detected, but server was started with the " - "--queryableBackupMode command line parameter."; - fassertFailedNoTrace(34373); - } - - if (!freeListDetails->firstExtent.isNull()) { - _extentManager->freeExtents( - opCtx, freeListDetails->firstExtent, freeListDetails->lastExtent); - } - - _namespaceIndex.kill_ns(opCtx, oldFreeList.ns()); - } - - DataFileVersion version = _extentManager->getFileFormat(opCtx); - if (version.isCompatibleWithCurrentCode().isOK() && !version.mayHave30Freelist()) { - if (storageGlobalParams.readOnly) { - severe() << "Legacy storage format detected, but server was started with the " - "--queryableBackupMode command line parameter."; - fassertFailedNoTrace(34374); - } - - // Any DB that can be opened and written to gets this flag set. - version.setMayHave30Freelist(); - _extentManager->setFileFormat(opCtx, version); - } - - const NamespaceString nsi(name(), "system.indexes"); - const NamespaceString nsn(name(), "system.namespaces"); - - bool isSystemNamespacesGoingToBeNew = _namespaceIndex.details(nsn.toString()) == NULL; - bool isSystemIndexesGoingToBeNew = _namespaceIndex.details(nsi.toString()) == NULL; - - _ensureSystemCollection(opCtx, nsn.toString()); - _ensureSystemCollection(opCtx, nsi.toString()); - - if (isSystemNamespacesGoingToBeNew) { - invariant(!storageGlobalParams.readOnly); - opCtx->recoveryUnit()->registerChange(new EntryInsertion(nsn.toString(), this)); - } - if (isSystemIndexesGoingToBeNew) { - invariant(!storageGlobalParams.readOnly); - opCtx->recoveryUnit()->registerChange(new EntryInsertion(nsi.toString(), this)); - } - - Entry*& indexEntry = _collections[nsi.toString()]; - Entry*& nsEntry = _collections[nsn.toString()]; - - NamespaceDetails* const indexDetails = _namespaceIndex.details(nsi.toString()); - NamespaceDetails* const nsDetails = _namespaceIndex.details(nsn.toString()); - - // order has to be: - // 1) ns rs - // 2) i rs - // 3) catalog entries - - if (!nsEntry) { - nsEntry = new Entry(); - - NamespaceDetailsRSV1MetaData* md = - new NamespaceDetailsRSV1MetaData(nsn.toString(), nsDetails); - nsEntry->recordStore.reset( - new SimpleRecordStoreV1(opCtx, nsn.toString(), md, _extentManager.get(), false)); - } - - if (!indexEntry) { - indexEntry = new Entry(); - - NamespaceDetailsRSV1MetaData* md = - new NamespaceDetailsRSV1MetaData(nsi.toString(), indexDetails); - - indexEntry->recordStore.reset( - new SimpleRecordStoreV1(opCtx, nsi.toString(), md, _extentManager.get(), true)); - } - - RecordId indexNamespaceId; - if (isSystemIndexesGoingToBeNew) { - indexNamespaceId = _addNamespaceToNamespaceCollection(opCtx, nsi.toString(), NULL); - } - - if (!nsEntry->catalogEntry) { - nsEntry->catalogEntry.reset( - new NamespaceDetailsCollectionCatalogEntry(nsn.toString(), - nsDetails, - nsEntry->recordStore.get(), - RecordId(), - indexEntry->recordStore.get(), - this)); - } - - if (!indexEntry->catalogEntry) { - indexEntry->catalogEntry.reset( - new NamespaceDetailsCollectionCatalogEntry(nsi.toString(), - indexDetails, - nsEntry->recordStore.get(), - indexNamespaceId, - indexEntry->recordStore.get(), - this)); - } - - if (!storageGlobalParams.readOnly) { - wunit->commit(); - } - - // 
Now put everything in the cache of namespaces. None of the operations below do any - // transactional operations. - RecordStoreV1Base* rs = _getNamespaceRecordStore(); - invariant(rs); - - auto cursor = rs->getCursor(opCtx); - while (auto record = cursor->next()) { - auto ns = record->data.releaseToBson()["name"].String(); - Entry*& entry = _collections[ns]; - - // The two cases where entry is not null is for system.indexes and system.namespaces, - // which we manually instantiated above. It is OK to skip these two collections, - // because they don't have indexes on them anyway. - if (entry) { - if (entry->catalogEntry->getNamespacesRecordId().isNull()) { - entry->catalogEntry->setNamespacesRecordId(opCtx, record->id); - } else { - invariant(entry->catalogEntry->getNamespacesRecordId() == record->id); - } - continue; - } - - entry = new Entry(); - _insertInCache(opCtx, ns, record->id, entry); - } -} - -Status MMAPV1DatabaseCatalogEntry::createCollection(OperationContext* opCtx, - StringData ns, - const CollectionOptions& options, - bool allocateDefaultSpace) { - if (_namespaceIndex.details(ns)) { - return Status(ErrorCodes::NamespaceExists, - str::stream() << "namespace already exists: " << ns); - } - - BSONObj optionsAsBSON = options.toBSON(); - RecordId rid = _addNamespaceToNamespaceCollection(opCtx, ns, &optionsAsBSON); - - _namespaceIndex.add_ns(opCtx, ns, DiskLoc(), options.capped); - NamespaceDetails* details = _namespaceIndex.details(ns); - - // Set the flags. - NamespaceDetailsRSV1MetaData(ns, details).replaceUserFlags(opCtx, options.flags); - - if (options.capped && options.cappedMaxDocs > 0) { - opCtx->recoveryUnit()->writingInt(details->maxDocsInCapped) = options.cappedMaxDocs; - } - - Entry*& entry = _collections[ns.toString()]; - invariant(!entry); - opCtx->recoveryUnit()->registerChange(new EntryInsertion(ns, this)); - entry = new Entry(); - _insertInCache(opCtx, ns, rid, entry); - - if (allocateDefaultSpace) { - RecordStoreV1Base* rs = _getRecordStore(ns); - if (options.initialNumExtents > 0) { - int size = _massageExtentSize(_extentManager.get(), options.cappedSize); - for (int i = 0; i < options.initialNumExtents; i++) { - rs->increaseStorageSize(opCtx, size, false); - } - } else if (!options.initialExtentSizes.empty()) { - for (size_t i = 0; i < options.initialExtentSizes.size(); i++) { - int size = options.initialExtentSizes[i]; - size = _massageExtentSize(_extentManager.get(), size); - rs->increaseStorageSize(opCtx, size, false); - } - } else if (options.capped) { - // normal - do { - // Must do this at least once, otherwise we leave the collection with no - // extents, which is invalid. - int sz = _massageExtentSize(_extentManager.get(), - options.cappedSize - rs->storageSize(opCtx)); - sz &= 0xffffff00; - rs->increaseStorageSize(opCtx, sz, false); - } while (rs->storageSize(opCtx) < options.cappedSize); - } else { - rs->increaseStorageSize(opCtx, _extentManager->initialSize(128), false); - } - } - - if (!options.collation.isEmpty()) { - markCollationFeatureAsInUse(opCtx); - } - - return Status::OK(); -} - -void MMAPV1DatabaseCatalogEntry::createNamespaceForIndex(OperationContext* opCtx, StringData name) { - // This is a simplified form of createCollection. 
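    // (Relative to createCollection: no CollectionOptions, no user flags, and no
    // default extent allocation; just the system.namespaces record, the
    // NamespaceIndex entry, and a cache insertion guarded by EntryInsertion.)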
- invariant(!_namespaceIndex.details(name)); - - RecordId rid = _addNamespaceToNamespaceCollection(opCtx, name, NULL); - _namespaceIndex.add_ns(opCtx, name, DiskLoc(), false); - - Entry*& entry = _collections[name.toString()]; - invariant(!entry); - opCtx->recoveryUnit()->registerChange(new EntryInsertion(name, this)); - entry = new Entry(); - _insertInCache(opCtx, name, rid, entry); -} - -NamespaceDetailsCollectionCatalogEntry* MMAPV1DatabaseCatalogEntry::getCollectionCatalogEntry( - StringData ns) const { - CollectionMap::const_iterator i = _collections.find(ns.toString()); - if (i == _collections.end()) { - return NULL; - } - - invariant(i->second->catalogEntry.get()); - return i->second->catalogEntry.get(); -} - -void MMAPV1DatabaseCatalogEntry::_insertInCache(OperationContext* opCtx, - StringData ns, - RecordId rid, - Entry* entry) { - NamespaceDetails* details = _namespaceIndex.details(ns); - invariant(details); - - entry->catalogEntry.reset(new NamespaceDetailsCollectionCatalogEntry( - ns, details, _getNamespaceRecordStore(), rid, _getIndexRecordStore(), this)); - - unique_ptr<NamespaceDetailsRSV1MetaData> md(new NamespaceDetailsRSV1MetaData(ns, details)); - const NamespaceString nss(ns); - - if (details->isCapped) { - entry->recordStore.reset(new CappedRecordStoreV1( - opCtx, NULL, ns, md.release(), _extentManager.get(), nss.coll() == "system.indexes")); - } else { - entry->recordStore.reset(new SimpleRecordStoreV1( - opCtx, ns, md.release(), _extentManager.get(), nss.coll() == "system.indexes")); - } -} - -RecordStore* MMAPV1DatabaseCatalogEntry::getRecordStore(StringData ns) const { - return _getRecordStore(ns); -} - -RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getRecordStore(StringData ns) const { - CollectionMap::const_iterator i = _collections.find(ns.toString()); - if (i == _collections.end()) { - return NULL; - } - - invariant(i->second->recordStore.get()); - return i->second->recordStore.get(); -} - -IndexAccessMethod* MMAPV1DatabaseCatalogEntry::getIndex(OperationContext* opCtx, - const CollectionCatalogEntry* collection, - IndexCatalogEntry* entry) { - const std::string& type = entry->descriptor()->getAccessMethodName(); - - std::string ns = collection->ns().ns(); - - RecordStoreV1Base* rs = _getRecordStore(entry->descriptor()->indexNamespace()); - invariant(rs); - - std::unique_ptr<SortedDataInterface> btree( - getMMAPV1Interface(entry->headManager(), - rs, - &rs->savedCursors, - entry->ordering(), - entry->descriptor()->indexNamespace(), - entry->descriptor()->version(), - entry->descriptor()->unique())); - - if (IndexNames::HASHED == type) - return new HashAccessMethod(entry, btree.release()); - - if (IndexNames::GEO_2DSPHERE == type) - return new S2AccessMethod(entry, btree.release()); - - if (IndexNames::TEXT == type) - return new FTSAccessMethod(entry, btree.release()); - - if (IndexNames::GEO_HAYSTACK == type) - return new HaystackAccessMethod(entry, btree.release()); - - if ("" == type) - return new BtreeAccessMethod(entry, btree.release()); - - if (IndexNames::GEO_2D == type) - return new TwoDAccessMethod(entry, btree.release()); - - log() << "Can't find index for keyPattern " << entry->descriptor()->keyPattern(); - fassertFailed(17489); -} - -RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getIndexRecordStore() { - const NamespaceString nss(name(), "system.indexes"); - Entry* entry = _collections[nss.toString()]; - invariant(entry); - - return entry->recordStore.get(); -} - -RecordStoreV1Base* MMAPV1DatabaseCatalogEntry::_getNamespaceRecordStore() const { - 
const NamespaceString nss(name(), "system.namespaces"); - CollectionMap::const_iterator i = _collections.find(nss.toString()); - invariant(i != _collections.end()); - - return i->second->recordStore.get(); -} - -RecordId MMAPV1DatabaseCatalogEntry::_addNamespaceToNamespaceCollection(OperationContext* opCtx, - StringData ns, - const BSONObj* options) { - if (nsToCollectionSubstring(ns) == "system.namespaces") { - // system.namespaces holds all the others, so it is not explicitly listed in the catalog. - return {}; - } - - BSONObjBuilder b; - b.append("name", ns); - if (options && !options->isEmpty()) { - b.append("options", *options); - } - - const BSONObj obj = b.done(); - - RecordStoreV1Base* rs = _getNamespaceRecordStore(); - invariant(rs); - // TODO SERVER-30638: using timestamp 0 for these inserts. - StatusWith<RecordId> loc = - rs->insertRecord(opCtx, obj.objdata(), obj.objsize(), Timestamp(), false); - massertStatusOK(loc.getStatus()); - return loc.getValue(); -} - -void MMAPV1DatabaseCatalogEntry::_removeNamespaceFromNamespaceCollection(OperationContext* opCtx, - StringData ns) { - if (nsToCollectionSubstring(ns) == "system.namespaces") { - // system.namespaces holds all the others, so it is not explicitly listed in the catalog. - return; - } - - auto entry = _collections.find(ns.toString()); - if (entry == _collections.end()) { - return; - } - - RecordStoreV1Base* rs = _getNamespaceRecordStore(); - invariant(rs); - - // Invalidate old namespace record - RecordId oldSpecLocation = entry->second->catalogEntry->getNamespacesRecordId(); - invalidateSystemCollectionRecord( - opCtx, NamespaceString(name(), "system.namespaces"), oldSpecLocation); - - rs->deleteRecord(opCtx, oldSpecLocation); -} - -CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions(OperationContext* opCtx, - StringData ns) const { - if (nsToCollectionSubstring(ns) == "system.namespaces") { - return {}; - } - - auto entry = _collections.find(ns.toString()); - if (entry == _collections.end()) { - return {}; - } - - return getCollectionOptions(opCtx, entry->second->catalogEntry->getNamespacesRecordId()); -} - -CollectionOptions MMAPV1DatabaseCatalogEntry::getCollectionOptions(OperationContext* opCtx, - RecordId rid) const { - CollectionOptions options; - - if (rid.isNull()) { - return options; - } - - RecordStoreV1Base* rs = _getNamespaceRecordStore(); - invariant(rs); - - RecordData data; - invariant(rs->findRecord(opCtx, rid, &data)); - - if (data.releaseToBson()["options"].isABSONObj()) { - Status status = options.parse(data.releaseToBson()["options"].Obj(), - CollectionOptions::parseForStorage); - fassert(18523, status); - } - return options; -} -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h b/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h deleted file mode 100644 index 67e562d4fe2..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h +++ /dev/null @@ -1,208 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <map> -#include <string> - -#include "mongo/base/status.h" -#include "mongo/base/string_data.h" -#include "mongo/db/catalog/database_catalog_entry.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details_collection_entry.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_index.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" - -namespace mongo { - -class CollectionCatalogEntry; -struct CollectionOptions; -class IndexAccessMethod; -class IndexCatalogEntry; -class IndexDescriptor; -class RecordId; -class RecordStore; -class RecordStoreV1Base; -class RecoveryUnit; -class OperationContext; - -class MMAPV1DatabaseCatalogEntry : public DatabaseCatalogEntry { -public: - MMAPV1DatabaseCatalogEntry(OperationContext* opCtx, - StringData name, - StringData path, - bool directoryperdb, - bool transient, - std::unique_ptr<ExtentManager> extentManager); - - virtual ~MMAPV1DatabaseCatalogEntry(); - - /** - * Must be called before destruction. - */ - virtual void close(OperationContext* opCtx) { - _extentManager->close(opCtx); - _namespaceIndex.close(opCtx); - } - - // these two seem the same and yet different - // TODO(ERH): consolidate into one ideally - virtual bool exists() const { - return _namespaceIndex.pathExists(); - } - virtual bool isEmpty() const { - return !_namespaceIndex.allocated(); - } - virtual bool hasUserData() const { - // The two collections which exist and can't be removed are: - // system.indexes - // system.namespaces - return _collections.size() > 2; - } - - virtual int64_t sizeOnDisk(OperationContext* opCtx) const; - - virtual bool isOlderThan24(OperationContext* opCtx) const; - virtual void markIndexSafe24AndUp(OperationContext* opCtx); - - // Records in the data file version bits that an index or collection may have an associated - // collation. 
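    // (Implementation note: per the .cpp above, this reads the DataFileVersion,
    // calls setMayHaveCollationMetadata(), and persists it via
    // ExtentManager::setFileFormat().)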
- void markCollationFeatureAsInUse(OperationContext* opCtx); - - virtual Status currentFilesCompatible(OperationContext* opCtx) const; - - virtual void appendExtraStats(OperationContext* opCtx, BSONObjBuilder* out, double scale) const; - - Status createCollection(OperationContext* opCtx, - StringData ns, - const CollectionOptions& options, - bool allocateDefaultSpace); - - Status dropCollection(OperationContext* opCtx, StringData ns); - - Status renameCollection(OperationContext* opCtx, - StringData fromNS, - StringData toNS, - bool stayTemp); - - void getCollectionNamespaces(std::list<std::string>* tofill) const; - - /** - * will return NULL if ns does not exist - */ - NamespaceDetailsCollectionCatalogEntry* getCollectionCatalogEntry(StringData ns) const; - - RecordStore* getRecordStore(StringData ns) const; - - IndexAccessMethod* getIndex(OperationContext* opCtx, - const CollectionCatalogEntry* collection, - IndexCatalogEntry* index); - - const ExtentManager* getExtentManager() const { - return _extentManager.get(); - } - ExtentManager* getExtentManager() { - return _extentManager.get(); - } - - CollectionOptions getCollectionOptions(OperationContext* opCtx, StringData ns) const; - - CollectionOptions getCollectionOptions(OperationContext* opCtx, RecordId nsRid) const; - - /** - * Creates a CollectionCatalogEntry in the form of an index rather than a collection. - * MMAPv1 puts both indexes and collections into CCEs. A namespace named 'name' must not - * exist. - */ - void createNamespaceForIndex(OperationContext* opCtx, StringData name); - static void invalidateSystemCollectionRecord(OperationContext* opCtx, - NamespaceString systemCollectionNamespace, - RecordId record); - -private: - class EntryInsertion; - class EntryRemoval; - - friend class NamespaceDetailsCollectionCatalogEntry; - - // The _collections map is a cache for efficiently looking up namespace information. Access - // to the cache is protected by holding the appropriate DB lock. Regular operations - // (insert/update/delete/query) hold intent locks on the database and they access the cache - // directly. Metadata operations, such as create db/collection, etc acquire exclusive lock - // on the database, which protects against concurrent readers of the cache. - // - // Once initialized, the cache must remain consistent with the data in the memory-mapped - // database files through _removeFromCache and _insertInCache. These methods use the - // RecoveryUnit to ensure correct handling of rollback. - - struct Entry { - std::unique_ptr<NamespaceDetailsCollectionCatalogEntry> catalogEntry; - std::unique_ptr<RecordStoreV1Base> recordStore; - }; - - typedef std::map<std::string, Entry*> CollectionMap; - - - RecordStoreV1Base* _getIndexRecordStore(); - RecordStoreV1Base* _getNamespaceRecordStore() const; - RecordStoreV1Base* _getRecordStore(StringData ns) const; - - RecordId _addNamespaceToNamespaceCollection(OperationContext* opCtx, - StringData ns, - const BSONObj* options); - - void _removeNamespaceFromNamespaceCollection(OperationContext* opCtx, StringData ns); - - Status _renameSingleNamespace(OperationContext* opCtx, - StringData fromNS, - StringData toNS, - bool stayTemp); - - void _ensureSystemCollection(OperationContext* opCtx, StringData ns); - - void _init(OperationContext* opCtx); - - /** - * Populate the _collections cache. - */ - void _insertInCache(OperationContext* opCtx, StringData ns, RecordId rid, Entry* entry); - - /** - * Drop cached information for specified namespace. 
If a RecoveryUnit is specified, - * use it to allow rollback. When ru is null, removal is unconditional. - */ - void _removeFromCache(RecoveryUnit* ru, StringData ns); - - - const std::string _path; - - NamespaceIndex _namespaceIndex; - std::unique_ptr<ExtentManager> _extentManager; - CollectionMap _collections; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp deleted file mode 100644 index 58bb1da6118..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.cpp +++ /dev/null @@ -1,420 +0,0 @@ -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h" - -#include <boost/filesystem/operations.hpp> -#include <boost/filesystem/path.hpp> -#include <fstream> - -#ifdef __linux__ -#include <sys/sysmacros.h> -#endif - -#include "mongo/db/client.h" -#include "mongo/db/mongod_options.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/data_file_sync.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/dur_journal.h" -#include "mongo/db/storage/mmap_v1/dur_recover.h" -#include "mongo/db/storage/mmap_v1/dur_recovery_unit.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/storage_engine_lock_file.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/util/clock_source.h" -#include "mongo/util/log.h" - - -namespace mongo { - -using std::endl; -using std::ifstream; -using std::string; -using std::stringstream; -using std::vector; - -namespace { - -#if !defined(__sun) -// if doingRepair is true don't consider unclean shutdown an error -void checkForUncleanShutdown(MMAPV1Engine* storageEngine, - bool doingRepair, - const StorageEngineLockFile& lockFile) { - string name = lockFile.getFilespec(); - bool oldFile = lockFile.createdByUncleanShutdown(); - - if (doingRepair) { - // This logic was previously in shared option parsing code. - storageGlobalParams.dur = false; - } - - if (oldFile) { - // we check this here because we want to see if we can get the lock - // if we can't, then its probably just another mongod running - - string errmsg; - if (doingRepair && dur::haveJournalFiles()) { - errmsg = - "************** \n" - "You specified --repair but there are dirty journal files. Please\n" - "restart without --repair to allow the journal files to be replayed.\n" - "If you wish to repair all databases, please shutdown cleanly and\n" - "run with --repair again.\n" - "**************"; - } else if (storageGlobalParams.dur) { - if (!dur::haveJournalFiles(/*anyFiles=*/true)) { - // Passing anyFiles=true as we are trying to protect against starting in an - // unclean state with the journal directory unmounted. If there are any files, - // even prealloc files, then it means that it is mounted so we can continue. - // Previously there was an issue (SERVER-5056) where we would fail to start up - // if killed during prealloc. - - vector<string> dbnames; - storageEngine->listDatabases(&dbnames); - - if (dbnames.size() == 0) { - // this means that mongod crashed - // between initial startup and when journaling was initialized - // it is safe to continue - } else { - errmsg = str::stream() - << "************** \n" - << "old lock file: " << name << ". 
probably means unclean shutdown,\n" - << "but there are no journal files to recover.\n" - << "this is likely human error or filesystem corruption.\n" - << "please make sure that your journal directory is mounted.\n" - << "found " << dbnames.size() << " dbs.\n" - << "see: http://dochub.mongodb.org/core/repair for more information\n" - << "*************"; - } - } - } else { - if (!dur::haveJournalFiles() && !doingRepair) { - errmsg = str::stream() << "************** \n" - << "Unclean shutdown detected.\n" - << "Please visit http://dochub.mongodb.org/core/repair for " - "recovery instructions.\n" - << "*************"; - } - } - - if (!errmsg.empty()) { - log() << errmsg << endl; - uassert(12596, "old lock file", 0); - } - } - - // Not related to lock file, but this is where we handle unclean shutdown - if (!storageGlobalParams.dur && dur::haveJournalFiles()) { - log() << "**************" << endl; - log() << "Error: journal files are present in journal directory, yet starting without " - "journaling enabled." - << endl; - log() << "It is recommended that you start with journaling enabled so that recovery may " - "occur." - << endl; - log() << "**************" << endl; - uasserted(13597, "can't start without --journal enabled when journal/ files are present"); - } -} -#else -void checkForUncleanShutdown(MMAPV1Engine* storageEngine, - bool doingRepair, - const StorageEngineLockFile& lockFile) { - // TODO - this is very bad that the code above not running here. - - if (doingRepair) { - // This logic was previously in shared option parsing code. - storageGlobalParams.dur = false; - } - - // Not related to lock file, but this is where we handle unclean shutdown - if (!storageGlobalParams.dur && dur::haveJournalFiles()) { - log() << "**************" << endl; - log() << "Error: journal files are present in journal directory, yet starting without " - "--journal enabled." - << endl; - log() << "It is recommended that you start with journaling enabled so that recovery may " - "occur." - << endl; - log() << "Alternatively (not recommended), you can backup everything, then delete the " - "journal files, and run --repair" - << endl; - log() << "**************" << endl; - uasserted(13618, "can't start without --journal enabled when journal/ files are present"); - } -} -#endif // !defined(__sun) - - -/// warn if readahead > 256KB (gridfs chunk size) -void checkReadAhead(const string& dir) { -#ifdef __linux__ - try { - const dev_t dev = getPartition(dir); - - // This path handles the case where the filesystem uses the whole device (including LVM) - string path = str::stream() << "/sys/dev/block/" << major(dev) << ':' << minor(dev) - << "/queue/read_ahead_kb"; - - if (!boost::filesystem::exists(path)) { - // This path handles the case where the filesystem is on a partition. - path = - str::stream() << "/sys/dev/block/" << major(dev) << ':' - << minor(dev) // this is a symlink - << "/.." 
// parent directory of a partition is for the whole device - << "/queue/read_ahead_kb"; - } - - if (boost::filesystem::exists(path)) { - ifstream file(path.c_str()); - if (file.is_open()) { - int kb; - file >> kb; - if (kb > 256) { - log() << startupWarningsLog; - - log() << "** WARNING: Readahead for " << dir << " is set to " << kb << "KB" - << startupWarningsLog; - - log() << "** We suggest setting it to 256KB (512 sectors) or less" - << startupWarningsLog; - - log() << "** http://dochub.mongodb.org/core/readahead" - << startupWarningsLog; - } - } - } - } catch (const std::exception& e) { - log() << "unable to validate readahead settings due to error: " << e.what() - << startupWarningsLog; - log() << "for more information, see http://dochub.mongodb.org/core/readahead" - << startupWarningsLog; - } -#endif // __linux__ -} - -// This is unrelated to the _tmp directory in dbpath. -void clearTmpFiles() { - boost::filesystem::path path(storageGlobalParams.dbpath); - for (boost::filesystem::directory_iterator i(path); - i != boost::filesystem::directory_iterator(); - ++i) { - string fileName = boost::filesystem::path(*i).leaf().string(); - if (boost::filesystem::is_directory(*i) && fileName.length() && fileName[0] == '$') - boost::filesystem::remove_all(*i); - } -} -} // namespace - -MMAPV1Engine::MMAPV1Engine(const StorageEngineLockFile* lockFile, ClockSource* cs) - : MMAPV1Engine(lockFile, cs, stdx::make_unique<MmapV1ExtentManager::Factory>()) {} - -MMAPV1Engine::MMAPV1Engine(const StorageEngineLockFile* lockFile, - ClockSource* cs, - std::unique_ptr<ExtentManager::Factory> extentManagerFactory) - : _recordAccessTracker(cs), - _extentManagerFactory(std::move(extentManagerFactory)), - _clock(cs), - _startMs(_clock->now().toMillisSinceEpoch()) { - // TODO check non-journal subdirs if using directory-per-db - checkReadAhead(storageGlobalParams.dbpath); - - if (!storageGlobalParams.readOnly) { - invariant(lockFile); - checkForUncleanShutdown(this, storageGlobalParams.repair, *lockFile); - - FileAllocator::get()->start(); - - MONGO_ASSERT_ON_EXCEPTION_WITH_MSG(clearTmpFiles(), "clear tmp files"); - } -} - -void MMAPV1Engine::finishInit() { - dataFileSync.go(); - - // Replays the journal (if needed) and starts the background thread. This requires the - // ability to create OperationContexts. - dur::startup(_clock, _startMs); -} - -MMAPV1Engine::~MMAPV1Engine() { - for (EntryMap::const_iterator it = _entryMap.begin(); it != _entryMap.end(); ++it) { - delete it->second; - } - _entryMap.clear(); -} - -RecoveryUnit* MMAPV1Engine::newRecoveryUnit() { - return new DurRecoveryUnit(); -} - -void MMAPV1Engine::listDatabases(std::vector<std::string>* out) const { - _listDatabases(storageGlobalParams.dbpath, out); -} - -DatabaseCatalogEntry* MMAPV1Engine::getDatabaseCatalogEntry(OperationContext* opCtx, - StringData db) { - { - stdx::lock_guard<stdx::mutex> lk(_entryMapMutex); - EntryMap::const_iterator iter = _entryMap.find(db.toString()); - if (iter != _entryMap.end()) { - return iter->second; - } - } - - // This is an on-demand database create/open. At this point, we are locked under X lock for - // the database (MMAPV1DatabaseCatalogEntry's constructor checks that) so no two threads - // can be creating the same database concurrently. We need to create the database outside of - // the _entryMapMutex so we do not deadlock (see SERVER-15880). 
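// An aside on the pattern implemented just below: a minimal standalone sketch of
// check-create-insert, assuming a hypothetical Registry/Entry pair; only the locking
// structure mirrors the original. Construction happens outside the mutex because it
// is slow and may itself take locks, so holding the mutex across it could deadlock,
// which is exactly the concern behind the SERVER-15880 comment above.
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct Entry {};  // stand-in for an expensive-to-construct catalog entry

class Registry {
public:
    Entry* getOrCreate(const std::string& key) {
        {
            std::lock_guard<std::mutex> lk(_mutex);  // fast path: already open
            auto it = _map.find(key);
            if (it != _map.end())
                return it->second.get();
        }
        // Built with no lock held; callers must guarantee (as the X lock does in
        // the original) that no other thread creates the same key concurrently.
        auto entry = std::make_unique<Entry>();
        std::lock_guard<std::mutex> lk(_mutex);      // publish under the mutex
        return _map.emplace(key, std::move(entry)).first->second.get();
    }

private:
    std::mutex _mutex;
    std::map<std::string, std::unique_ptr<Entry>> _map;
};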
- MMAPV1DatabaseCatalogEntry* entry = new MMAPV1DatabaseCatalogEntry( - opCtx, - db, - storageGlobalParams.dbpath, - storageGlobalParams.directoryperdb, - false, - _extentManagerFactory->create( - db, storageGlobalParams.dbpath, storageGlobalParams.directoryperdb)); - - stdx::lock_guard<stdx::mutex> lk(_entryMapMutex); - - // Sanity check that we are not overwriting something - invariant(_entryMap.insert(EntryMap::value_type(db.toString(), entry)).second); - - return entry; -} - -Status MMAPV1Engine::closeDatabase(OperationContext* opCtx, StringData db) { - // Before the files are closed, flush any potentially outstanding changes, which might - // reference this database. Otherwise we will assert when subsequent applications of the - // global journal entries occur, which happen to have write intents for the removed files. - getDur().syncDataAndTruncateJournal(opCtx); - - stdx::lock_guard<stdx::mutex> lk(_entryMapMutex); - MMAPV1DatabaseCatalogEntry* entry = _entryMap[db.toString()]; - if (entry) { - entry->close(opCtx); - } - delete entry; - _entryMap.erase(db.toString()); - return Status::OK(); -} - -Status MMAPV1Engine::dropDatabase(OperationContext* opCtx, StringData db) { - Status status = closeDatabase(opCtx, db); - if (!status.isOK()) - return status; - - _deleteDataFiles(db.toString()); - - return Status::OK(); -} - -void MMAPV1Engine::_listDatabases(const std::string& directory, std::vector<std::string>* out) { - boost::filesystem::path path(directory); - for (boost::filesystem::directory_iterator i(path); - i != boost::filesystem::directory_iterator(); - ++i) { - if (storageGlobalParams.directoryperdb) { - boost::filesystem::path p = *i; - string dbName = p.leaf().string(); - p /= (dbName + ".ns"); - if (exists(p)) - out->push_back(dbName); - } else { - string fileName = boost::filesystem::path(*i).leaf().string(); - if (fileName.length() > 3 && fileName.substr(fileName.length() - 3, 3) == ".ns") - out->push_back(fileName.substr(0, fileName.length() - 3)); - } - } -} - -int MMAPV1Engine::flushAllFiles(OperationContext* opCtx, bool sync) { - return MongoFile::flushAll(opCtx, sync); -} - -Status MMAPV1Engine::beginBackup(OperationContext* opCtx) { - return Status::OK(); -} - -void MMAPV1Engine::endBackup(OperationContext* opCtx) { - return; -} - -bool MMAPV1Engine::isDurable() const { - return getDur().isDurable(); -} - -bool MMAPV1Engine::isEphemeral() const { - return false; -} - -RecordAccessTracker& MMAPV1Engine::getRecordAccessTracker() { - return _recordAccessTracker; -} - -void MMAPV1Engine::cleanShutdown() { - // wait until file preallocation finishes - // we would only hang here if the file_allocator code generates a - // synchronous signal, which we don't expect - log() << "shutdown: waiting for fs preallocator..." << endl; - auto opCtx = cc().getOperationContext(); - - // In some cases we may shutdown early before we have any operation context yet, but we need - // one for synchronization purposes. - ServiceContext::UniqueOperationContext newTxn; - if (!opCtx) { - newTxn = cc().makeOperationContext(); - opCtx = newTxn.get(); - invariant(opCtx); - } - - FileAllocator::get()->waitUntilFinished(); - - if (storageGlobalParams.dur) { - log() << "shutdown: final commit..." << endl; - - getDur().commitAndStopDurThread(opCtx); - } - - log() << "shutdown: closing all files..." 
<< endl; - stringstream ss3; - MemoryMappedFile::closeAllFiles(opCtx, ss3); - log() << ss3.str() << endl; -} - -void MMAPV1Engine::setJournalListener(JournalListener* jl) { - dur::setJournalListener(jl); -} -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h b/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h deleted file mode 100644 index 92ab5bfc6f5..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_engine.h +++ /dev/null @@ -1,130 +0,0 @@ -// mmap_v1_engine.h - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <map> - -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_access_tracker.h" -#include "mongo/db/storage/storage_engine.h" -#include "mongo/stdx/mutex.h" - -namespace mongo { - -class ClockSource; -class JournalListener; -class MMAPV1DatabaseCatalogEntry; - -class MMAPV1Engine : public StorageEngine { -public: - MMAPV1Engine(const StorageEngineLockFile* lockFile, ClockSource* cs); - - MMAPV1Engine(const StorageEngineLockFile* lockFile, - ClockSource* cs, - std::unique_ptr<ExtentManager::Factory> extentManagerFactory); - virtual ~MMAPV1Engine(); - - void finishInit(); - - RecoveryUnit* newRecoveryUnit(); - void listDatabases(std::vector<std::string>* out) const; - - int flushAllFiles(OperationContext* opCtx, bool sync); - Status beginBackup(OperationContext* opCtx); - void endBackup(OperationContext* opCtx); - - DatabaseCatalogEntry* getDatabaseCatalogEntry(OperationContext* opCtx, StringData db); - - virtual bool supportsDocLocking() const { - return false; - } - virtual bool isMmapV1() const { - return true; - } - - virtual bool isDurable() const; - - virtual bool isEphemeral() const; - - virtual Status closeDatabase(OperationContext* opCtx, StringData db); - - virtual Status dropDatabase(OperationContext* opCtx, StringData db); - - virtual void cleanShutdown(); - - // Callers should use repairDatabase instead. 
- virtual Status repairRecordStore(OperationContext* opCtx, const std::string& ns) { - return Status(ErrorCodes::InternalError, "MMAPv1 doesn't support repairRecordStore"); - } - - // MMAPv1 specific (non-virtual) - Status repairDatabase(OperationContext* opCtx, - const std::string& dbName, - bool preserveClonedFilesOnFailure, - bool backupOriginalFiles); - - /** - * Gets a reference to the abstraction used by MMAP v1 to track recently used memory - * addresses. - * - * MMAPv1 specific (non-virtual). This is non-const because callers are allowed to use - * the returned reference to modify the RecordAccessTracker. - * - * The RecordAccessTracker is thread-safe (it uses its own mutex internally). - */ - RecordAccessTracker& getRecordAccessTracker(); - - void setJournalListener(JournalListener* jl) final; - - Timestamp getAllCommittedTimestamp() const override { - MONGO_UNREACHABLE; - } - -private: - static void _listDatabases(const std::string& directory, std::vector<std::string>* out); - - stdx::mutex _entryMapMutex; - typedef std::map<std::string, MMAPV1DatabaseCatalogEntry*> EntryMap; - EntryMap _entryMap; - - // A record access tracker is essentially a large table which tracks recently used - // addresses. It is used when higher layers (e.g. the query system) need to ask - // the storage engine whether data is likely in physical memory. - RecordAccessTracker _recordAccessTracker; - - std::unique_ptr<ExtentManager::Factory> _extentManagerFactory; - - ClockSource* _clock; - int64_t _startMs; -}; - -void _deleteDataFiles(const std::string& database); -} diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp deleted file mode 100644 index f3127dc56b0..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.cpp +++ /dev/null @@ -1,675 +0,0 @@ -// mmap_v1_extent_manager.cpp - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include <boost/filesystem/operations.hpp> - -#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" - -#include "mongo/base/counter.h" -#include "mongo/db/audit.h" -#include "mongo/db/client.h" -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/record_fetcher.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/fail_point_service.h" -#include "mongo/util/file.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::unique_ptr; -using std::endl; -using std::max; -using std::string; -using std::stringstream; - -// Turn on this failpoint to force the system to yield for a fetch. Setting to "alwaysOn" -// will cause yields for fetching to occur on every 'kNeedsFetchFailFreq'th call to -// recordNeedsFetch(). -static const int kNeedsFetchFailFreq = 2; -static Counter64 needsFetchFailCounter; -MONGO_FAIL_POINT_DEFINE(recordNeedsFetchFail); - -// Used to make sure the compiler doesn't get too smart on us when we're -// trying to touch records. -// volatile - avoid compiler optimizations for touching a mmap page -volatile int __record_touch_dummy = 1; // NOLINT - -class MmapV1RecordFetcher : public RecordFetcher { - MONGO_DISALLOW_COPYING(MmapV1RecordFetcher); - -public: - explicit MmapV1RecordFetcher(const MmapV1RecordHeader* record) : _record(record) {} - - virtual void setup(OperationContext* opCtx) { - invariant(!_filesLock.get()); - _filesLock.reset(new LockMongoFilesShared(opCtx)); - } - - virtual void fetch() { - // It's only legal to touch the record while we're holding a lock on the data files. - invariant(_filesLock.get()); - - const char* recordChar = reinterpret_cast<const char*>(_record); - - // Here's where we actually dereference a pointer into the record. This is where - // we expect a page fault to occur, so we should do this out of the lock. - __record_touch_dummy += *recordChar; - - // We're not going to touch the record anymore, so we can give up our - // lock on mongo files. We do this here because we have to release the - // lock on mongo files prior to reacquiring lock mgr locks. - _filesLock.reset(); - } - -private: - // The record which needs to be touched in order to page fault. Not owned by us. - const MmapV1RecordHeader* _record; - - // This ensures that our MmapV1RecordHeader* does not drop out from under our feet before - // we dereference it. 
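// An aside on fetch() above: a minimal sketch of the touch-to-page-in trick,
// assuming a 4KB page size; the volatile sink keeps the compiler from eliding
// the reads whose whole purpose is to trigger the page fault.
#include <cstddef>

volatile int g_touchSink = 0;  // volatile: the reads must stay observable

// Reads one byte from every OS page in [p, p + len) so the pages are
// resident before the caller re-acquires its heavier locks.
inline void touchPages(const char* p, std::size_t len, std::size_t pageSize = 4096) {
    for (std::size_t off = 0; off < len; off += pageSize) {
        g_touchSink = g_touchSink + p[off];
    }
}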
- std::unique_ptr<LockMongoFilesShared> _filesLock; -}; - -MmapV1ExtentManager::MmapV1ExtentManager(StringData dbname, StringData path, bool directoryPerDB) - : _dbname(dbname.toString()), - _path(path.toString()), - _directoryPerDB(directoryPerDB), - _rid(RESOURCE_METADATA, dbname) { - StorageEngine* engine = getGlobalServiceContext()->getStorageEngine(); - invariant(engine->isMmapV1()); - MMAPV1Engine* mmapEngine = static_cast<MMAPV1Engine*>(engine); - _recordAccessTracker = &mmapEngine->getRecordAccessTracker(); -} - -std::unique_ptr<ExtentManager> MmapV1ExtentManager::Factory::create(StringData dbname, - StringData path, - bool directoryPerDB) { - return stdx::make_unique<MmapV1ExtentManager>( - std::move(dbname), std::move(path), directoryPerDB); -} - -boost::filesystem::path MmapV1ExtentManager::_fileName(int n) const { - stringstream ss; - ss << _dbname << '.' << n; - boost::filesystem::path fullName(_path); - if (_directoryPerDB) - fullName /= _dbname; - fullName /= ss.str(); - return fullName; -} - - -Status MmapV1ExtentManager::init(OperationContext* opCtx) { - invariant(_files.empty()); - - for (int n = 0; n < DiskLoc::MaxFiles; n++) { - const boost::filesystem::path fullName = _fileName(n); - if (!boost::filesystem::exists(fullName)) { - break; - } - - const std::string fullNameString = fullName.string(); - - { - // If the file is uninitialized we exit the loop because it is just prealloced. We - // do this on a bare File object rather than using the DataFile because closing a - // DataFile triggers dur::closingFileNotification() which is fatal if there are any - // pending writes. Therefore we must only open files that we know we want to keep. - File preview; - preview.open(fullNameString.c_str(), /*readOnly*/ true); - invariant(preview.is_open()); - - // File can't be initialized if too small. - if (preview.len() < sizeof(DataFileHeader)) { - break; - } - - // This is the equivalent of DataFileHeader::uninitialized(). - int version; - preview.read(0, reinterpret_cast<char*>(&version), sizeof(version)); - invariant(!preview.bad()); - if (version == 0) { - break; - } - } - - unique_ptr<DataFile> df(new DataFile(opCtx, n)); - - Status s = df->openExisting(opCtx, fullNameString.c_str()); - if (!s.isOK()) { - df->close(opCtx); - return s; - } - - invariant(!df->getHeader()->uninitialized()); - - // We only checkUpgrade on files that we are keeping, not preallocs. - df->getHeader()->checkUpgrade(opCtx); - - _files.push_back(df.release()); - } - - // If this is a new database being created, instantiate the first file and one extent so - // we can have a coherent database. - if (_files.empty()) { - WriteUnitOfWork wuow(opCtx); - _createExtent(opCtx, initialSize(128), false); - wuow.commit(); - - // Commit the journal and all changes to disk so that even if exceptions occur during - // subsequent initialization, we won't have uncommitted changes during file close. 
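// An aside on the scan loop above: a minimal sketch of the "is this data file
// just a prealloc?" probe, assuming plain iostreams instead of the File class.
// A file whose leading 32-bit version word is still zero was preallocated but
// never initialized, which is where the scan stops.
#include <cstdint>
#include <fstream>
#include <string>

bool looksInitialized(const std::string& path) {
    std::ifstream f(path, std::ios::binary);
    std::int32_t version = 0;
    if (!f.read(reinterpret_cast<char*>(&version), sizeof(version)))
        return false;  // too short to hold a header: cannot be initialized
    return version != 0;  // zero plays the role of DataFileHeader::uninitialized()
}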
- getDur().commitNow(opCtx); - } - - return Status::OK(); -} - -const DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) const { - invariant(fileId >= 0 && fileId < _files.size(), - str::stream() << "_getOpenFile() invalid file index requested " << fileId); - - return _files[fileId]; -} - -DataFile* MmapV1ExtentManager::_getOpenFile(int fileId) { - invariant(fileId >= 0 && fileId < _files.size(), - str::stream() << "_getOpenFile() invalid file index requested " << fileId); - - return _files[fileId]; -} - -DataFile* MmapV1ExtentManager::_addAFile(OperationContext* opCtx, - int sizeNeeded, - bool preallocateNextFile) { - // Database must be stable and we need to be in some sort of an update operation in order - // to add a new file. - invariant(opCtx->lockState()->isDbLockedForMode(_dbname, MODE_IX)); - - const int allocFileId = _files.size(); - - int minSize = 0; - if (allocFileId > 0) { - // Make the next file at least as large as the previous - minSize = _files[allocFileId - 1]->getHeader()->fileLength; - } - - if (minSize < sizeNeeded + DataFileHeader::HeaderSize) { - minSize = sizeNeeded + DataFileHeader::HeaderSize; - } - - { - unique_ptr<DataFile> allocFile(new DataFile(opCtx, allocFileId)); - const string allocFileName = _fileName(allocFileId).string(); - - Timer t; - - try { - allocFile->open(opCtx, allocFileName.c_str(), minSize, false); - } catch (...) { - allocFile->close(opCtx); - throw; - } - if (t.seconds() > 1) { - log() << "MmapV1ExtentManager took " << t.seconds() - << " seconds to open: " << allocFileName; - } - - // It's all good - _files.push_back(allocFile.release()); - } - - // Preallocate is asynchronous - if (preallocateNextFile) { - unique_ptr<DataFile> nextFile(new DataFile(opCtx, allocFileId + 1)); - const string nextFileName = _fileName(allocFileId + 1).string(); - - try { - nextFile->open(opCtx, nextFileName.c_str(), minSize, false); - } catch (...) { - nextFile->close(opCtx); - throw; - } - } - - // Returns the last file added - return _files[allocFileId]; -} - -int MmapV1ExtentManager::numFiles() const { - return _files.size(); -} - -long long MmapV1ExtentManager::fileSize() const { - long long size = 0; - for (int n = 0; boost::filesystem::exists(_fileName(n)); n++) { - size += boost::filesystem::file_size(_fileName(n)); - } - - return size; -} - -MmapV1RecordHeader* MmapV1ExtentManager::_recordForV1(const DiskLoc& loc) const { - loc.assertOk(); - const DataFile* df = _getOpenFile(loc.a()); - - int ofs = loc.getOfs(); - if (ofs < DataFileHeader::HeaderSize) { - df->badOfs(ofs); // will msgassert - external call to keep out of the normal code path - } - - return reinterpret_cast<MmapV1RecordHeader*>(df->p() + ofs); -} - -MmapV1RecordHeader* MmapV1ExtentManager::recordForV1(const DiskLoc& loc) const { - MmapV1RecordHeader* record = _recordForV1(loc); - _recordAccessTracker->markAccessed(record); - return record; -} - -std::unique_ptr<RecordFetcher> MmapV1ExtentManager::recordNeedsFetch(const DiskLoc& loc) const { - if (loc.isNull()) - return {}; - MmapV1RecordHeader* record = _recordForV1(loc); - - // For testing: if failpoint is enabled we randomly request fetches without - // going to the RecordAccessTracker. 
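// An aside on the failpoint logic just below: a minimal sketch of every-Nth-call
// triggering, assuming a hypothetical EveryNth helper; the counter/frequency pair
// plays the role of needsFetchFailCounter and kNeedsFetchFailFreq above.
#include <atomic>
#include <cstdint>

class EveryNth {
public:
    explicit EveryNth(std::uint64_t n) : _n(n) {}
    bool hit() {  // true on every n-th call, safe from any thread
        return (_count.fetch_add(1) + 1) % _n == 0;
    }

private:
    const std::uint64_t _n;
    std::atomic<std::uint64_t> _count{0};
};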
- if (MONGO_FAIL_POINT(recordNeedsFetchFail)) { - needsFetchFailCounter.increment(); - if ((needsFetchFailCounter.get() % kNeedsFetchFailFreq) == 0) { - return stdx::make_unique<MmapV1RecordFetcher>(record); - } - } - - if (!_recordAccessTracker->checkAccessedAndMark(record)) { - return stdx::make_unique<MmapV1RecordFetcher>(record); - } - - return {}; -} - -DiskLoc MmapV1ExtentManager::extentLocForV1(const DiskLoc& loc) const { - MmapV1RecordHeader* record = recordForV1(loc); - return DiskLoc(loc.a(), record->extentOfs()); -} - -Extent* MmapV1ExtentManager::extentForV1(const DiskLoc& loc) const { - DiskLoc extentLoc = extentLocForV1(loc); - return getExtent(extentLoc); -} - -Extent* MmapV1ExtentManager::getExtent(const DiskLoc& loc, bool doSanityCheck) const { - loc.assertOk(); - Extent* e = reinterpret_cast<Extent*>(_getOpenFile(loc.a())->p() + loc.getOfs()); - if (doSanityCheck) - e->assertOk(); - - _recordAccessTracker->markAccessed(e); - - return e; -} - -void _checkQuota(bool enforceQuota, int fileNo) { - if (!enforceQuota) - return; - - if (fileNo < mmapv1GlobalOptions.quotaFiles) - return; - - uasserted(12501, "quota exceeded"); -} - -int MmapV1ExtentManager::maxSize() const { - return DataFile::maxSize() - DataFileHeader::HeaderSize - 16; -} - -DiskLoc MmapV1ExtentManager::_createExtentInFile( - OperationContext* opCtx, int fileNo, DataFile* f, int size, bool enforceQuota) { - _checkQuota(enforceQuota, fileNo - 1); - - massert(10358, "bad new extent size", size >= minSize() && size <= maxSize()); - - DiskLoc loc = f->allocExtentArea(opCtx, size); - loc.assertOk(); - - Extent* e = getExtent(loc, false); - verify(e); - - *opCtx->recoveryUnit()->writing(&e->magic) = Extent::extentSignature; - *opCtx->recoveryUnit()->writing(&e->myLoc) = loc; - *opCtx->recoveryUnit()->writing(&e->length) = size; - - return loc; -} - - -DiskLoc MmapV1ExtentManager::_createExtent(OperationContext* opCtx, int size, bool enforceQuota) { - size = quantizeExtentSize(size); - - if (size > maxSize()) - size = maxSize(); - - verify(size < DataFile::maxSize()); - - for (int i = numFiles() - 1; i >= 0; i--) { - DataFile* f = _getOpenFile(i); - invariant(f); - - if (f->getHeader()->unusedLength >= size) { - return _createExtentInFile(opCtx, i, f, size, enforceQuota); - } - } - - _checkQuota(enforceQuota, numFiles()); - - // no space in an existing file - // allocate files until we either get one big enough or hit maxSize - for (int i = 0; i < 8; i++) { - DataFile* f = _addAFile(opCtx, size, false); - - if (f->getHeader()->unusedLength >= size) { - return _createExtentInFile(opCtx, numFiles() - 1, f, size, enforceQuota); - } - } - - // callers don't check for null return code, so assert - msgasserted(14810, "couldn't allocate space for a new extent"); -} - -DiskLoc MmapV1ExtentManager::_allocFromFreeList(OperationContext* opCtx, - int approxSize, - bool capped) { - // setup extent constraints - - int low, high; - if (capped) { - // be strict about the size - low = approxSize; - if (low > 2048) - low -= 256; - high = (int)(approxSize * 1.05) + 256; - } else { - low = (int)(approxSize * 0.8); - high = (int)(approxSize * 1.4); - } - if (high <= 0) { - // overflowed - high = max(approxSize, maxSize()); - } - if (high <= minSize()) { - // the minimum extent size is 4097 - high = minSize() + 1; - } - - // scan free list looking for something suitable - - int n = 0; - Extent* best = 0; - int bestDiff = 0x7fffffff; - { - Timer t; - DiskLoc L = _getFreeListStart(); - while (!L.isNull()) { - Extent* e = getExtent(L); - if 
(e->length >= low && e->length <= high) { - int diff = abs(e->length - approxSize); - if (diff < bestDiff) { - bestDiff = diff; - best = e; - if (((double)diff) / approxSize < 0.1) { - // close enough - break; - } - if (t.seconds() >= 2) { - // have spent lots of time in write lock, and we are in [low,high], so close - // enough could come into play if extent freelist is very long - break; - } - } else { - OCCASIONALLY { - if (high < 64 * 1024 && t.seconds() >= 2) { - // be less picky if it is taking a long time - high = 64 * 1024; - } - } - } - } - L = e->xnext; - ++n; - } - if (t.seconds() >= 10) { - log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl; - } - } - - if (n > 128) { - LOG(n < 512 ? 1 : 0) << "warning: newExtent " << n << " scanned\n"; - } - - if (!best) - return DiskLoc(); - - // remove from the free list - if (!best->xprev.isNull()) - *opCtx->recoveryUnit()->writing(&getExtent(best->xprev)->xnext) = best->xnext; - if (!best->xnext.isNull()) - *opCtx->recoveryUnit()->writing(&getExtent(best->xnext)->xprev) = best->xprev; - if (_getFreeListStart() == best->myLoc) - _setFreeListStart(opCtx, best->xnext); - if (_getFreeListEnd() == best->myLoc) - _setFreeListEnd(opCtx, best->xprev); - - return best->myLoc; -} - -DiskLoc MmapV1ExtentManager::allocateExtent(OperationContext* opCtx, - bool capped, - int size, - bool enforceQuota) { - Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X); - bool fromFreeList = true; - DiskLoc eloc = _allocFromFreeList(opCtx, size, capped); - if (eloc.isNull()) { - fromFreeList = false; - eloc = _createExtent(opCtx, size, enforceQuota); - } - - invariant(!eloc.isNull()); - invariant(eloc.isValid()); - - LOG(1) << "MmapV1ExtentManager::allocateExtent" - << " desiredSize:" << size << " fromFreeList: " << fromFreeList << " eloc: " << eloc; - - return eloc; -} - -void MmapV1ExtentManager::freeExtent(OperationContext* opCtx, DiskLoc firstExt) { - Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X); - Extent* e = getExtent(firstExt); - opCtx->recoveryUnit()->writing(&e->xnext)->Null(); - opCtx->recoveryUnit()->writing(&e->xprev)->Null(); - opCtx->recoveryUnit()->writing(&e->firstRecord)->Null(); - opCtx->recoveryUnit()->writing(&e->lastRecord)->Null(); - - - if (_getFreeListStart().isNull()) { - _setFreeListStart(opCtx, firstExt); - _setFreeListEnd(opCtx, firstExt); - } else { - DiskLoc a = _getFreeListStart(); - invariant(getExtent(a)->xprev.isNull()); - *opCtx->recoveryUnit()->writing(&getExtent(a)->xprev) = firstExt; - *opCtx->recoveryUnit()->writing(&getExtent(firstExt)->xnext) = a; - _setFreeListStart(opCtx, firstExt); - } -} - -void MmapV1ExtentManager::freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt) { - Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_X); - - if (firstExt.isNull() && lastExt.isNull()) - return; - - { - verify(!firstExt.isNull() && !lastExt.isNull()); - Extent* f = getExtent(firstExt); - Extent* l = getExtent(lastExt); - verify(f->xprev.isNull()); - verify(l->xnext.isNull()); - verify(f == l || !f->xnext.isNull()); - verify(f == l || !l->xprev.isNull()); - } - - if (_getFreeListStart().isNull()) { - _setFreeListStart(opCtx, firstExt); - _setFreeListEnd(opCtx, lastExt); - } else { - DiskLoc a = _getFreeListStart(); - invariant(getExtent(a)->xprev.isNull()); - *opCtx->recoveryUnit()->writing(&getExtent(a)->xprev) = lastExt; - *opCtx->recoveryUnit()->writing(&getExtent(lastExt)->xnext) = a; - _setFreeListStart(opCtx, firstExt); - } -} - -DiskLoc 
MmapV1ExtentManager::_getFreeListStart() const { - if (_files.empty()) - return DiskLoc(); - const DataFile* file = _getOpenFile(0); - return file->header()->freeListStart; -} - -DiskLoc MmapV1ExtentManager::_getFreeListEnd() const { - if (_files.empty()) - return DiskLoc(); - const DataFile* file = _getOpenFile(0); - return file->header()->freeListEnd; -} - -void MmapV1ExtentManager::_setFreeListStart(OperationContext* opCtx, DiskLoc loc) { - invariant(!_files.empty()); - DataFile* file = _files[0]; - *opCtx->recoveryUnit()->writing(&file->header()->freeListStart) = loc; -} - -void MmapV1ExtentManager::_setFreeListEnd(OperationContext* opCtx, DiskLoc loc) { - invariant(!_files.empty()); - DataFile* file = _files[0]; - *opCtx->recoveryUnit()->writing(&file->header()->freeListEnd) = loc; -} - -void MmapV1ExtentManager::freeListStats(OperationContext* opCtx, - int* numExtents, - int64_t* totalFreeSizeBytes) const { - Lock::ResourceLock rlk(opCtx->lockState(), _rid, MODE_S); - - invariant(numExtents); - invariant(totalFreeSizeBytes); - - *numExtents = 0; - *totalFreeSizeBytes = 0; - - DiskLoc a = _getFreeListStart(); - while (!a.isNull()) { - Extent* e = getExtent(a); - (*numExtents)++; - (*totalFreeSizeBytes) += e->length; - a = e->xnext; - } -} - - -namespace { -class CacheHintMadvise : public ExtentManager::CacheHint { -public: - CacheHintMadvise(void* p, unsigned len, MAdvise::Advice a) : _advice(p, len, a) {} - -private: - MAdvise _advice; -}; -} - -ExtentManager::CacheHint* MmapV1ExtentManager::cacheHint(const DiskLoc& extentLoc, - const ExtentManager::HintType& hint) { - invariant(hint == Sequential); - Extent* e = getExtent(extentLoc); - return new CacheHintMadvise(reinterpret_cast<void*>(e), e->length, MAdvise::Sequential); -} - -MmapV1ExtentManager::FilesArray::~FilesArray() { - for (int i = 0; i < size(); i++) { - delete _files[i]; - } -} - -void MmapV1ExtentManager::FilesArray::close(OperationContext* opCtx) { - for (int i = 0; i < size(); i++) { - _files[i]->close(opCtx); - } -} - -void MmapV1ExtentManager::FilesArray::push_back(DataFile* val) { - stdx::lock_guard<stdx::mutex> lk(_writersMutex); - const int n = _size.load(); - invariant(n < DiskLoc::MaxFiles); - // Note ordering: _size update must come after updating the _files array - _files[n] = val; - _size.store(n + 1); -} - -DataFileVersion MmapV1ExtentManager::getFileFormat(OperationContext* opCtx) const { - if (numFiles() == 0) - return DataFileVersion(0, 0); - - // We explicitly only look at the first file. - return _getOpenFile(0)->getHeader()->version; -} - -void MmapV1ExtentManager::setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) { - invariant(numFiles() > 0); - - DataFile* df = _getOpenFile(0); - invariant(df); - - *opCtx->recoveryUnit()->writing(&df->getHeader()->version) = newVersion; -} -} diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h b/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h deleted file mode 100644 index dff9de9efe9..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h +++ /dev/null @@ -1,258 +0,0 @@ -// mmap_v1_extent_manager.h - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. 
-* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include <string> - -#include <boost/filesystem/path.hpp> - -#include "mongo/base/status.h" -#include "mongo/base/string_data.h" -#include "mongo/db/concurrency/lock_manager_defs.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_access_tracker.h" -#include "mongo/platform/atomic_word.h" -#include "mongo/stdx/mutex.h" - -namespace mongo { - -class DataFile; -class DataFileVersion; -class MmapV1RecordHeader; -class OperationContext; - -struct Extent; - -/** - * ExtentManager basics - * - one per database - * - responsible for managing <db>.# files - * - NOT responsible for .ns file - * - gives out extents - * - responsible for figuring out how to get a new extent - * - can use any method it wants to do so - * - this structure is NOT stored on disk - * - this class is thread safe, except as indicated below - * - * Implementation: - * - ExtentManager holds a preallocated list of DataFile - * - files will not be removed from the EM, so _files access can be lock-free - * - extent size and loc are immutable - * - Any non-const public operations on an ExtentManager will acquire a MODE_X lock on its - * RESOURCE_MMAPv1_EXTENT_MANAGER resource from the lock-manager, which extends the lock's - * life through WriteUnitOfWorks that might need rollback. Private methods will only - * be called from public ones. - */ -class MmapV1ExtentManager : public ExtentManager { - MONGO_DISALLOW_COPYING(MmapV1ExtentManager); - -public: - class Factory : public ExtentManager::Factory { - virtual std::unique_ptr<ExtentManager> create(StringData dbname, - StringData path, - bool directoryPerDB) final; - }; - - /** - * @param freeListDetails this is a reference into the .ns file; - * while a bit odd, this is not a layer violation, as extents - * are a peer to the .ns file, without any layering - */ - MmapV1ExtentManager(StringData dbname, StringData path, bool directoryPerDB); - - /** - * Must be called before destruction. 
- */ - void close(OperationContext* opCtx) { - _files.close(opCtx); - } - - /** - * opens all current files, not thread safe - */ - Status init(OperationContext* opCtx); - - int numFiles() const; - long long fileSize() const; - - // must call Extent::reuse on the returned extent - DiskLoc allocateExtent(OperationContext* opCtx, bool capped, int size, bool enforceQuota); - - /** - * firstExt has to be == lastExt or a chain - */ - void freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt); - - /** - * frees a single extent - * ignores all fields in the Extent except: magic, myLoc, length - */ - void freeExtent(OperationContext* opCtx, DiskLoc extent); - - - void freeListStats(OperationContext* opCtx, int* numExtents, int64_t* totalFreeSizeBytes) const; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader - * Note(erh): this sadly cannot be removed. - * A MmapV1RecordHeader DiskLoc has an offset from a file, while a RecordStore really wants an - * offset from an extent. This intrinsically links an original record store to the original - * extent manager. - */ - MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const; - - std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent) - * Note(erh) see comment on recordFor - */ - Extent* extentForV1(const DiskLoc& loc) const; - - /** - * @param loc - has to be for a specific MmapV1RecordHeader (not an Extent) - * Note(erh) see comment on recordFor - */ - DiskLoc extentLocForV1(const DiskLoc& loc) const; - - /** - * @param loc - has to be for a specific Extent - */ - Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const; - - /** - * Not thread safe, requires a database exclusive lock - */ - DataFileVersion getFileFormat(OperationContext* opCtx) const final; - void setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) final; - - const DataFile* getOpenFile(int n) const final { - return _getOpenFile(n); - } - - virtual int maxSize() const; - - virtual CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint); - -private: - /** - * will return NULL if nothing suitable in free list - */ - DiskLoc _allocFromFreeList(OperationContext* opCtx, int approxSize, bool capped); - - /* allocate a new Extent, does not check free list - */ - DiskLoc _createExtent(OperationContext* opCtx, int approxSize, bool enforceQuota); - - DataFile* _addAFile(OperationContext* opCtx, int sizeNeeded, bool preallocateNextFile); - - - /** - * Shared record retrieval logic used by the public recordForV1() and likelyInPhysicalMem() - * above. - */ - MmapV1RecordHeader* _recordForV1(const DiskLoc& loc) const; - - DiskLoc _getFreeListStart() const; - DiskLoc _getFreeListEnd() const; - void _setFreeListStart(OperationContext* opCtx, DiskLoc loc); - void _setFreeListEnd(OperationContext* opCtx, DiskLoc loc); - - const DataFile* _getOpenFile(int fileId) const; - DataFile* _getOpenFile(int fileId); - - DiskLoc _createExtentInFile( - OperationContext* opCtx, int fileNo, DataFile* f, int size, bool enforceQuota); - - boost::filesystem::path _fileName(int n) const; - - // ----- - - const std::string _dbname; // i.e. "test" - const std::string _path; // i.e. "/data/db" - const bool _directoryPerDB; - const ResourceId _rid; - - // This reference points into the MMAPv1 engine and is only valid as long as the - // engine is valid. Not owned here. 
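// An aside on the FilesArray declared further below: a minimal sketch of its
// append-only publication scheme, assuming release/acquire atomics. Writers fill
// the slot first and only then bump the size, so lock-free readers that consult
// size() before indexing never observe a half-written slot.
#include <atomic>
#include <mutex>

template <typename T, int MaxN>
class AppendOnlyArray {
public:
    void push_back(T* val) {
        std::lock_guard<std::mutex> lk(_writersMutex);   // serialize writers
        int n = _size.load(std::memory_order_relaxed);
        _slots[n] = val;                                 // write the slot...
        _size.store(n + 1, std::memory_order_release);   // ...then publish it
    }
    T* operator[](int i) const { return _slots[i]; }     // valid for i < size()
    int size() const { return _size.load(std::memory_order_acquire); }

private:
    std::mutex _writersMutex;
    std::atomic<int> _size{0};
    T* _slots[MaxN] = {};
};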
- RecordAccessTracker* _recordAccessTracker; - - /** - * Simple wrapper around an array object to allow append-only modification of the array, - * as well as concurrent read-accesses. This class has a minimal interface to keep - * implementation simple and easy to modify. - */ - class FilesArray { - public: - FilesArray() : _size(0) {} - ~FilesArray(); - - /** - * Must be called before destruction. - */ - void close(OperationContext* opCtx); - - /** - * Returns file at location 'n' in the array, with 'n' less than number of files added. - * Will always return the same pointer for a given file. - */ - DataFile* operator[](int n) const { - invariant(n >= 0 && n < size()); - return _files[n]; - } - - /** - * Returns true iff no files were added - */ - bool empty() const { - return size() == 0; - } - - /** - * Returns number of files added to the array - */ - int size() const { - return _size.load(); - } - - // Appends val to the array, taking ownership of its pointer - void push_back(DataFile* val); - - private: - stdx::mutex _writersMutex; - AtomicInt32 _size; // number of files in the array - DataFile* _files[DiskLoc::MaxFiles]; - }; - - FilesArray _files; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp deleted file mode 100644 index 42ba6cb864c..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_init.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/platform/basic.h" - -#include "mongo/base/init.h" -#include "mongo/bson/bsonobjbuilder.h" -#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h" -#include "mongo/db/storage/storage_engine_init.h" -#include "mongo/db/storage/storage_engine_metadata.h" -#include "mongo/db/storage/storage_options.h" - -namespace mongo { - -namespace { - -class MMAPV1Factory : public StorageEngine::Factory { -public: - virtual ~MMAPV1Factory() {} - virtual StorageEngine* create(const StorageGlobalParams& params, - const StorageEngineLockFile* lockFile) const { - return new MMAPV1Engine(lockFile, getGlobalServiceContext()->getFastClockSource()); - } - - virtual StringData getCanonicalName() const { - return "mmapv1"; - } - - virtual Status validateMetadata(const StorageEngineMetadata& metadata, - const StorageGlobalParams& params) const { - Status status = - metadata.validateStorageEngineOption("directoryPerDB", params.directoryperdb); - if (!status.isOK()) { - return status; - } - - return Status::OK(); - } - - virtual BSONObj createMetadataOptions(const StorageGlobalParams& params) const { - BSONObjBuilder builder; - builder.appendBool("directoryPerDB", params.directoryperdb); - return builder.obj(); - } - - bool supportsReadOnly() const override { - return true; - } -}; - -} // namespace - -MONGO_INITIALIZER_WITH_PREREQUISITES(MMAPV1EngineInit, ("ServiceContext")) -(InitializerContext* context) { - registerStorageEngine(getGlobalServiceContext(), std::make_unique<MMAPV1Factory>()); - return Status::OK(); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp deleted file mode 100644 index dff7166e77a..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_init_test.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (C) 2015 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/platform/basic.h" - - -#include "mongo/db/json.h" -#include "mongo/db/service_context.h" -#include "mongo/db/storage/storage_engine_init.h" -#include "mongo/db/storage/storage_engine_metadata.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/mongoutils/str.h" - -namespace { - -using namespace mongo; - -class MMAPV1FactoryTest : public mongo::unittest::Test { -private: - virtual void setUp() { - ServiceContext* globalEnv = getGlobalServiceContext(); - ASSERT_TRUE(globalEnv); - ASSERT_TRUE(isRegisteredStorageEngine(globalEnv, "mmapv1")); - factory = getFactoryForStorageEngine(globalEnv, "mmapv1"); - ASSERT_TRUE(factory); - } - - virtual void tearDown() { - factory = nullptr; - } - -protected: - const StorageEngine::Factory* factory; -}; - -void _testValidateMetadata(const StorageEngine::Factory* factory, - const BSONObj& metadataOptions, - bool directoryPerDB, - ErrorCodes::Error expectedCode) { - // It is fine to specify an invalid data directory for the metadata - // as long as we do not invoke read() or write(). - StorageEngineMetadata metadata("no_such_directory"); - metadata.setStorageEngineOptions(metadataOptions); - - StorageGlobalParams storageOptions; - storageOptions.directoryperdb = directoryPerDB; - - Status status = factory->validateMetadata(metadata, storageOptions); - if (expectedCode != status.code()) { - FAIL(str::stream() - << "Unexpected StorageEngine::Factory::validateMetadata result. Expected: " - << ErrorCodes::errorString(expectedCode) - << " but got " - << status.toString() - << " instead. metadataOptions: " - << metadataOptions - << "; directoryPerDB: " - << directoryPerDB); - } -} - -// Do not validate fields that are not present in metadata. -TEST_F(MMAPV1FactoryTest, ValidateMetadataEmptyOptions) { - _testValidateMetadata(factory, BSONObj(), false, ErrorCodes::OK); - _testValidateMetadata(factory, BSONObj(), true, ErrorCodes::OK); -} - -TEST_F(MMAPV1FactoryTest, ValidateMetadataDirectoryPerDB) { - _testValidateMetadata( - factory, fromjson("{directoryPerDB: 123}"), false, ErrorCodes::FailedToParse); - _testValidateMetadata(factory, fromjson("{directoryPerDB: false}"), false, ErrorCodes::OK); - _testValidateMetadata( - factory, fromjson("{directoryPerDB: false}"), true, ErrorCodes::InvalidOptions); - _testValidateMetadata( - factory, fromjson("{directoryPerDB: true}"), false, ErrorCodes::InvalidOptions); - _testValidateMetadata(factory, fromjson("{directoryPerDB: true}"), true, ErrorCodes::OK); -} - -void _testCreateMetadataOptions(const StorageEngine::Factory* factory, bool directoryPerDB) { - StorageGlobalParams storageOptions; - storageOptions.directoryperdb = directoryPerDB; - - BSONObj metadataOptions = factory->createMetadataOptions(storageOptions); - BSONElement directoryPerDBElement = metadataOptions.getField("directoryPerDB"); - ASSERT_TRUE(directoryPerDBElement.isBoolean()); - ASSERT_EQUALS(directoryPerDB, directoryPerDBElement.boolean()); -} - -TEST_F(MMAPV1FactoryTest, CreateMetadataOptions) { - _testCreateMetadataOptions(factory, false); - _testCreateMetadataOptions(factory, true); -} - -} // namespace diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_noinit.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_noinit.cpp deleted file mode 100644 index c6c7737ee88..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_noinit.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright (C) 2016 MongoDB Inc. 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -// Empty file to be used when mmapv1 is not enabled -// diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp deleted file mode 100644 index 87986746d93..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_options.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Copyright (C) 2017 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" - -mongo::MMAPV1Options mongo::mmapv1GlobalOptions; diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_options.h b/src/mongo/db/storage/mmap_v1/mmap_v1_options.h deleted file mode 100644 index f5b101f553c..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_options.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <string> - -/* - * This file defines the storage for options that come from the command line related to the - * mmap v1 storage engine. - */ - -namespace mongo { - -struct MMAPV1Options { - MMAPV1Options() - : lenForNewNsFiles(16 * 1024 * 1024), - preallocj(true), - prealloc(false), - quota(false), - quotaFiles(8) {} - - // --nssize - // Specifies the default size for namespace files, which are files that end in .ns. - // Each collection and index counts as a namespace. 
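// An aside on the journalOptions bits declared further down this struct: a minimal
// sketch of how ORed option bits are combined and tested. The values mirror the
// enum below; main() here is only an illustration.
#include <cstdio>

enum JournalBits { kDumpJournal = 1, kScanOnly = 2, kRecoverOnly = 4 };

int main() {
    int journalOptions = kDumpJournal | kRecoverOnly;    // e.g. --journalOptions 5
    if (journalOptions & kRecoverOnly)                   // bit test, not equality
        std::printf("terminate after the recovery step\n");
    return 0;
}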
- unsigned lenForNewNsFiles; - - bool preallocj; // --nopreallocj no preallocation of journal files - bool prealloc; // --noprealloc no preallocation of data files - bool smallfiles; // --smallfiles allocate smaller data files - - // --journalOptions 7 dump journal and terminate without doing anything further - // --journalOptions 4 recover and terminate without listening - enum { // bits to be ORed - JournalDumpJournal = 1, // dump diagnostics on the journal during recovery - JournalScanOnly = 2, // don't do any real work, just scan and dump if dump - // specified - JournalRecoverOnly = 4, // terminate after recovery step - JournalParanoid = 8, // paranoid mode enables extra checks - JournalAlwaysCommit = 16, // do a group commit every time the writelock is released - JournalAlwaysRemap = 32, // remap the private view after every group commit - // (may lag to the next write lock acquisition, - // but will do all files then) - JournalNoCheckSpace = 64 // don't check that there is enough room for journal files - // before startup (for diskfull tests) - }; - int journalOptions; // --journalOptions <n> for debugging - - // --quota - // Enables a maximum limit for the number of data files each database can have. - // When running with the --quota option, MongoDB has a maximum of 8 data files - // per database. Adjust the quota with --quotaFiles. - bool quota; - - // --quotaFiles - // Modifies the limit on the number of data files per database. - // The --quotaFiles option requires that you set --quota. - int quotaFiles; // --quotaFiles -}; - -extern MMAPV1Options mmapv1GlobalOptions; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp b/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp deleted file mode 100644 index 3a1e71fad40..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_v1_record_store_test.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#include "mongo/platform/basic.h" - -#include "mongo/base/init.h" -#include "mongo/db/operation_context_noop.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" -#include "mongo/db/storage/record_store_test_harness.h" -#include "mongo/db/storage/recovery_unit_noop.h" -#include "mongo/unittest/unittest.h" - -namespace mongo { -namespace { - -class MyHarnessHelper : public RecordStoreHarnessHelper { -public: - MyHarnessHelper() {} - - virtual std::unique_ptr<RecordStore> newNonCappedRecordStore() { - return newNonCappedRecordStore("a.b"); - } - - virtual std::unique_ptr<RecordStore> newNonCappedRecordStore(const std::string& ns) { - OperationContextNoop opCtx; - auto md = stdx::make_unique<DummyRecordStoreV1MetaData>(false, 0); - md->setUserFlag(&opCtx, CollectionOptions::Flag_NoPadding); - return stdx::make_unique<SimpleRecordStoreV1>(&opCtx, ns, md.release(), &_em, false); - } - - virtual std::unique_ptr<RecordStore> newCappedRecordStore(int64_t cappedMaxSize, - int64_t cappedMaxDocs) { - return newCappedRecordStore("a.b", cappedMaxSize, cappedMaxDocs); - } - - virtual std::unique_ptr<RecordStore> newCappedRecordStore(const std::string& ns, - int64_t cappedMaxSize, - int64_t cappedMaxDocs) { - OperationContextNoop opCtx; - auto md = stdx::make_unique<DummyRecordStoreV1MetaData>(true, 0); - auto md_ptr = md.get(); - std::unique_ptr<RecordStore> rs = - stdx::make_unique<CappedRecordStoreV1>(&opCtx, nullptr, ns, md.release(), &_em, false); - - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); - initializeV1RS(&opCtx, records, drecs, NULL, &_em, md_ptr); - - return rs; - } - - std::unique_ptr<RecoveryUnit> newRecoveryUnit() override { - return stdx::make_unique<RecoveryUnitNoop>(); - } - - bool supportsDocLocking() final { - return false; - } - -private: - DummyExtentManager _em; -}; - -std::unique_ptr<HarnessHelper> makeHarnessHelper() { - return stdx::make_unique<MyHarnessHelper>(); -} - -MONGO_INITIALIZER(RegisterHarnessFactory)(InitializerContext* const) { - mongo::registerHarnessHelperFactory(makeHarnessHelper); - return Status::OK(); -} -} // namespace -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/mmap_windows.cpp b/src/mongo/db/storage/mmap_v1/mmap_windows.cpp deleted file mode 100644 index 8d949a22243..00000000000 --- a/src/mongo/db/storage/mmap_v1/mmap_windows.cpp +++ /dev/null @@ -1,487 +0,0 @@ -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kControl - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/mmap.h" - -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/stdx/mutex.h" -#include "mongo/util/log.h" -#include "mongo/util/processinfo.h" -#include "mongo/util/text.h" -#include "mongo/util/timer.h" - -using std::endl; -using std::string; -using std::vector; - -std::size_t mongo::getMinOSPageSizeBytes() { - static const std::size_t cachedSize = [] { - SYSTEM_INFO si; - GetSystemInfo(&si); - std::size_t minOSPageSizeBytes = si.dwPageSize; - minOSPageSizeBytesTest(minOSPageSizeBytes); - return minOSPageSizeBytes; - }(); - return cachedSize; -} - -namespace mongo { - -// MapViewMutex -// -// Protects: -// 1. Ensures all MapViewOfFile/UnMapViewOfFile operations are serialized to reduce chance of -// "address in use" errors (error code 487) -// - These errors can still occur if the memory is used for other purposes -// (stack storage, heap) -// 2. Prevents calls to VirtualProtect while we remapping files. 
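// A hedged sketch of the serialization pattern that mapViewMutex enforces
// above: a single mutex taken around every map and unmap so concurrent calls
// cannot race for the same address range (the "address in use", error 487,
// failures the comment describes). mapOneView()/unmapOneView() are
// hypothetical stand-ins for the MapViewOfFile/UnmapViewOfFile calls.
#include <mutex>

std::mutex demoMapViewMutex;

void* mapOneView();        // stand-in: maps and returns a new view
void unmapOneView(void*);  // stand-in: releases a view

void* mapSerialized() {
    std::lock_guard<std::mutex> lk(demoMapViewMutex);
    return mapOneView();
}

void unmapSerialized(void* view) {
    std::lock_guard<std::mutex> lk(demoMapViewMutex);
    unmapOneView(view);
}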
-// Lock Ordering: -// - If taken, must be after previewViews._m to prevent deadlocks -stdx::mutex mapViewMutex; - -MAdvise::MAdvise(void*, unsigned, Advice) {} -MAdvise::~MAdvise() {} - -const unsigned long long memoryMappedFileLocationFloor = 256LL * 1024LL * 1024LL * 1024LL; -static unsigned long long _nextMemoryMappedFileLocation = memoryMappedFileLocationFloor; - -// nextMemoryMappedFileLocationMutex -// -// Protects: -// Windows 64-bit specific allocation of virtual memory regions for -// placing memory mapped files in memory -// Lock Ordering: -// No restrictions -static SimpleMutex _nextMemoryMappedFileLocationMutex; - -unsigned long long AlignNumber(unsigned long long number, unsigned long long granularity) { - return (number + granularity - 1) & ~(granularity - 1); -} - -static void* getNextMemoryMappedFileLocation(unsigned long long mmfSize) { - if (4 == sizeof(void*)) { - return 0; - } - stdx::lock_guard<SimpleMutex> lk(_nextMemoryMappedFileLocationMutex); - - static unsigned long long granularity = 0; - - if (0 == granularity) { - SYSTEM_INFO systemInfo; - GetSystemInfo(&systemInfo); - granularity = static_cast<unsigned long long>(systemInfo.dwAllocationGranularity); - } - - unsigned long long thisMemoryMappedFileLocation = _nextMemoryMappedFileLocation; - - int current_retry = 1; - - while (true) { - MEMORY_BASIC_INFORMATION memInfo; - - if (VirtualQuery(reinterpret_cast<LPCVOID>(thisMemoryMappedFileLocation), - &memInfo, - sizeof(memInfo)) == 0) { - DWORD gle = GetLastError(); - - // If we exceed the limits of Virtual Memory - // - 8TB before Windows 8.1/2012 R2, 128 TB after - // restart scanning from our memory mapped floor once more - // This is a linear scan of regions, not of every VM page - if (gle == ERROR_INVALID_PARAMETER && current_retry == 1) { - thisMemoryMappedFileLocation = memoryMappedFileLocationFloor; - ++current_retry; - continue; - } - - log() << "VirtualQuery of " << thisMemoryMappedFileLocation << " failed with error " - << errnoWithDescription(gle); - fassertFailed(17484); - } - - // Free memory regions that we can use for memory map files - // 1. Marked MEM_FREE, not MEM_RESERVE - // 2. 
Marked as PAGE_NOACCESS, not anything else - if (memInfo.Protect == PAGE_NOACCESS && memInfo.State == MEM_FREE && - memInfo.RegionSize > mmfSize) - break; - - // Align the memory location in case RegionSize is not aligned to the OS allocation - // granularity size - thisMemoryMappedFileLocation = AlignNumber( - reinterpret_cast<unsigned long long>(memInfo.BaseAddress) + memInfo.RegionSize, - granularity); - } - - _nextMemoryMappedFileLocation = - thisMemoryMappedFileLocation + AlignNumber(mmfSize, granularity); - - return reinterpret_cast<void*>(static_cast<uintptr_t>(thisMemoryMappedFileLocation)); -} - -void MemoryMappedFile::close(OperationContext* opCtx) { - LockMongoFilesShared::assertExclusivelyLocked(opCtx); - - // Prevent flush and close from concurrently running - stdx::lock_guard<stdx::mutex> lk(_flushMutex); - - { - stdx::lock_guard<stdx::mutex> lk(mapViewMutex); - - for (vector<void*>::iterator i = views.begin(); i != views.end(); i++) { - UnmapViewOfFile(*i); - } - } - - views.clear(); - totalMappedLength.fetchAndSubtract(len); - len = 0; - - if (maphandle) - CloseHandle(maphandle); - maphandle = 0; - if (fd) { - CloseHandle(fd); - fd = 0; - } - - destroyed(opCtx); // cleans up from the master list of mmaps -} - -bool MemoryMappedFile::isClosed() { - return !len && !fd && !views.size(); -} - -void* MemoryMappedFile::map(OperationContext* opCtx, - const char* filenameIn, - unsigned long long& length) { - verify(fd == 0 && len == 0); // can't open more than once - setFilename(opCtx, filenameIn); - FileAllocator::get()->allocateAsap(filenameIn, length); - /* big hack here: Babble uses db names with colons. doesn't seem to work on windows. temporary - * perhaps. */ - char filename[256]; - strncpy(filename, filenameIn, 255); - filename[255] = 0; - { - size_t len = strlen(filename); - for (int i = len - 1; i >= 0; i--) { - if (filename[i] == '/' || filename[i] == '\\') - break; - - if (filename[i] == ':') - filename[i] = '_'; - } - } - - updateLength(filename, length); - - const bool readOnly = isOptionSet(READONLY); - - { - DWORD createOptions = FILE_ATTRIBUTE_NORMAL; - if (isOptionSet(SEQUENTIAL)) - createOptions |= FILE_FLAG_SEQUENTIAL_SCAN; - - DWORD desiredAccess = readOnly ? GENERIC_READ : (GENERIC_READ | GENERIC_WRITE); - DWORD shareMode = readOnly ? FILE_SHARE_READ : (FILE_SHARE_WRITE | FILE_SHARE_READ); - - fd = CreateFileW(toWideString(filename).c_str(), - desiredAccess, // desired access - shareMode, // share mode - NULL, // security - OPEN_ALWAYS, // create disposition - createOptions, // flags - NULL); // hTempl - if (fd == INVALID_HANDLE_VALUE) { - DWORD dosError = GetLastError(); - severe() << "CreateFileW for " << filename << " failed with " - << errnoWithDescription(dosError) << " (file size is " << length << ")" - << " in MemoryMappedFile::map" << endl; - return 0; - } - } - - { - DWORD flProtect = readOnly ? PAGE_READONLY : PAGE_READWRITE; - maphandle = CreateFileMappingW(fd, - NULL, - flProtect, - length >> 32 /*maxsizehigh*/, - (unsigned)length /*maxsizelow*/, - NULL /*lpName*/); - if (maphandle == NULL) { - DWORD dosError = GetLastError(); - severe() << "CreateFileMappingW for " << filename << " failed with " - << errnoWithDescription(dosError) << " (file size is " << length << ")" - << " in MemoryMappedFile::map" << endl; - LockMongoFilesExclusive lock(opCtx); - close(opCtx); - fassertFailed(16225); - } - } - - void* view = 0; - { - stdx::lock_guard<stdx::mutex> lk(mapViewMutex); - DWORD access = readOnly ? 
FILE_MAP_READ : FILE_MAP_ALL_ACCESS; - - int current_retry = 0; - while (true) { - LPVOID thisAddress = getNextMemoryMappedFileLocation(length); - - view = MapViewOfFileEx(maphandle, // file mapping handle - access, // access - 0, - 0, // file offset, high and low - 0, // bytes to map, 0 == all - thisAddress); // address to place file - - if (view == 0) { - DWORD dosError = GetLastError(); - - ++current_retry; - - // If we failed to allocate a memory mapped file, try again in case we picked - // an address that Windows is also trying to use for some other VM allocations - if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) { - continue; - } - -#ifndef _WIN64 - // Warn user that if they are running a 32-bit app on 64-bit Windows - if (dosError == ERROR_NOT_ENOUGH_MEMORY) { - BOOL wow64Process; - BOOL retWow64 = IsWow64Process(GetCurrentProcess(), &wow64Process); - if (retWow64 && wow64Process) { - log() << "This is a 32-bit MongoDB binary running on a 64-bit" - " operating system that has run out of virtual memory for" - " databases. Switch to a 64-bit build of MongoDB to open" - " the databases."; - } - } -#endif - - severe() << "MapViewOfFileEx for " << filename << " at address " << thisAddress - << " failed with " << errnoWithDescription(dosError) << " (file size is " - << length << ")" - << " in MemoryMappedFile::map" << endl; - - LockMongoFilesExclusive lock(opCtx); - close(opCtx); - fassertFailed(16166); - } - - break; - } - } - - // MemoryMappedFile successfully created, now update state. - len = length; - totalMappedLength.fetchAndAdd(len); - - views.push_back(view); - - return view; -} - -extern stdx::mutex mapViewMutex; - -void* MemoryMappedFile::createPrivateMap() { - verify(maphandle); - - stdx::lock_guard<stdx::mutex> lk(mapViewMutex); - - LPVOID thisAddress = getNextMemoryMappedFileLocation(len); - - void* privateMapAddress = NULL; - int current_retry = 0; - - while (true) { - privateMapAddress = MapViewOfFileEx(maphandle, // file mapping handle - FILE_MAP_READ, // access - 0, - 0, // file offset, high and low - 0, // bytes to map, 0 == all - thisAddress); // address to place file - - if (privateMapAddress == 0) { - DWORD dosError = GetLastError(); - - ++current_retry; - - // If we failed to allocate a memory mapped file, try again in case we picked - // an address that Windows is also trying to use for some other VM allocations - if (dosError == ERROR_INVALID_ADDRESS && current_retry < 5) { - continue; - } - - severe() << "MapViewOfFileEx for " << filename() << " failed with error " - << errnoWithDescription(dosError) << " (file size is " << len << ")" - << " in MemoryMappedFile::createPrivateMap" << endl; - - fassertFailed(16167); - } - - break; - } - - views.push_back(privateMapAddress); - return privateMapAddress; -} - -void* MemoryMappedFile::remapPrivateView(OperationContext* opCtx, void* oldPrivateAddr) { - LockMongoFilesExclusive lockMongoFiles(opCtx); - - privateViews.clearWritableBits(oldPrivateAddr, len); - - stdx::lock_guard<stdx::mutex> lk(mapViewMutex); - - if (!UnmapViewOfFile(oldPrivateAddr)) { - DWORD dosError = GetLastError(); - severe() << "UnMapViewOfFile for " << filename() << " failed with error " - << errnoWithDescription(dosError) << " in MemoryMappedFile::remapPrivateView" - << endl; - fassertFailed(16168); - } - - void* newPrivateView = - MapViewOfFileEx(maphandle, // file mapping handle - FILE_MAP_READ, // access - 0, - 0, // file offset, high and low - 0, // bytes to map, 0 == all - oldPrivateAddr); // we want the same address we had before 
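// Standalone check (assumed granularity, not from the tree) of the
// round-up arithmetic in AlignNumber() above, which
// getNextMemoryMappedFileLocation() uses to advance past occupied regions:
// (number + granularity - 1) & ~(granularity - 1) for power-of-two sizes.
#include <cassert>

unsigned long long alignUp(unsigned long long number, unsigned long long granularity) {
    return (number + granularity - 1) & ~(granularity - 1);
}

int main() {
    const unsigned long long granularity = 0x10000;  // a typical 64KB allocation granularity
    assert(alignUp(0x0, granularity) == 0x0);        // already aligned: unchanged
    assert(alignUp(0x1, granularity) == 0x10000);    // rounds up, never down
    assert(alignUp(0x10000, granularity) == 0x10000);
    assert(alignUp(0x10001, granularity) == 0x20000);
    return 0;
}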
- if (0 == newPrivateView) { - DWORD dosError = GetLastError(); - severe() << "MapViewOfFileEx for " << filename() << " failed with error " - << errnoWithDescription(dosError) << " (file size is " << len << ")" - << " in MemoryMappedFile::remapPrivateView" << endl; - } - fassert(16148, newPrivateView == oldPrivateAddr); - return newPrivateView; -} - -class WindowsFlushable : public MemoryMappedFile::Flushable { -public: - WindowsFlushable(MemoryMappedFile* theFile, - void* view, - HANDLE fd, - const uint64_t id, - const std::string& filename, - stdx::mutex& flushMutex) - : _theFile(theFile), - _view(view), - _fd(fd), - _id(id), - _filename(filename), - _flushMutex(flushMutex) {} - - void flush(OperationContext* opCtx) { - if (!_view || !_fd) - return; - - { - LockMongoFilesShared mmfilesLock(opCtx); - - std::set<MongoFile*> mmfs = MongoFile::getAllFiles(); - std::set<MongoFile*>::const_iterator it = mmfs.find(_theFile); - if (it == mmfs.end() || (*it)->getUniqueId() != _id) { - // this was deleted while we were unlocked - return; - } - - // Hold the flush mutex to ensure the file is not closed during flush - _flushMutex.lock(); - } - - stdx::lock_guard<stdx::mutex> lk(_flushMutex, stdx::adopt_lock); - - int loopCount = 0; - bool success = false; - bool timeout = false; - int dosError = ERROR_SUCCESS; - const int maximumTimeInSeconds = 60 * 15; - Timer t; - while (!success && !timeout) { - ++loopCount; - success = FALSE != FlushViewOfFile(_view, 0); - if (!success) { - dosError = GetLastError(); - if (dosError != ERROR_LOCK_VIOLATION) { - break; - } - timeout = t.seconds() > maximumTimeInSeconds; - } - } - if (success && loopCount > 1) { - log() << "FlushViewOfFile for " << _filename << " succeeded after " << loopCount - << " attempts taking " << t.millis() << "ms" << endl; - } else if (!success) { - log() << "FlushViewOfFile for " << _filename << " failed with error " << dosError - << " after " << loopCount << " attempts taking " << t.millis() << "ms" << endl; - // Abort here to avoid data corruption - fassert(16387, false); - } - - success = FALSE != FlushFileBuffers(_fd); - if (!success) { - int err = GetLastError(); - log() << "FlushFileBuffers failed: " << errnoWithDescription(err) - << " file: " << _filename << endl; - dataSyncFailedHandler(); - } - } - - MemoryMappedFile* _theFile; // this may be deleted while we are running - void* _view; - HANDLE _fd; - const uint64_t _id; - string _filename; - stdx::mutex& _flushMutex; -}; - -void MemoryMappedFile::flush(bool sync) { - invariant(!(isOptionSet(Options::READONLY))); - uassert(13056, "Async flushing not supported on windows", sync); - if (!views.empty()) { - WindowsFlushable f(this, viewForFlushing(), fd, _uniqueId, filename(), _flushMutex); - auto opCtx = cc().getOperationContext(); - invariant(opCtx); - f.flush(opCtx); - } -} - -MemoryMappedFile::Flushable* MemoryMappedFile::prepareFlush() { - return new WindowsFlushable(this, viewForFlushing(), fd, _uniqueId, filename(), _flushMutex); -} -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/paths.cpp b/src/mongo/db/storage/mmap_v1/paths.cpp deleted file mode 100644 index 8e17c6cf716..00000000000 --- a/src/mongo/db/storage/mmap_v1/paths.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. 
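// Condensed sketch of the retry policy in WindowsFlushable::flush() above:
// retry only while the error stays retryable (ERROR_LOCK_VIOLATION in the
// original) and a fixed wall-clock budget (60 * 15 seconds) has not expired.
// tryFlushOnce()/isRetryable() are hypothetical stand-ins for a
// FlushViewOfFile() attempt plus its GetLastError() result.
#include <chrono>

bool tryFlushOnce(int& err);  // stand-in: one flush attempt, sets err on failure
bool isRetryable(int err);    // stand-in: err == ERROR_LOCK_VIOLATION

bool flushWithRetry() {
    using Clock = std::chrono::steady_clock;
    const auto deadline = Clock::now() + std::chrono::seconds(60 * 15);
    int err = 0;
    while (!tryFlushOnce(err)) {
        if (!isRetryable(err))
            return false;  // hard error: the caller must abort
        if (Clock::now() > deadline)
            return false;  // timed out waiting for the lock violation to clear
    }
    return true;
}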
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/paths.h" - -#include "mongo/util/log.h" - -namespace mongo { - -/** from a full path */ -RelativePath RelativePath::fromFullPath(boost::filesystem::path dbp, boost::filesystem::path f) { - // filesystem::path normalizes / and backslash - std::string fullpath = f.string(); - std::string relative = str::after(fullpath, dbp.string()); - if (relative.empty()) { - log() << "warning file is not under db path? " << fullpath << ' ' << dbp.string(); - RelativePath rp; - rp._p = fullpath; - return rp; - } - if (str::startsWith(relative, "/") || str::startsWith(relative, "\\")) { - relative.erase(0, 1); - } - RelativePath rp; - rp._p = relative; - return rp; -} - -dev_t getPartition(const std::string& path) { - struct stat stats; - - if (stat(path.c_str(), &stats) != 0) { - uasserted(13646, - str::stream() << "stat() failed for file: " << path << " " - << errnoWithDescription()); - } - - return stats.st_dev; -} - -void flushMyDirectory(const boost::filesystem::path& file) { -#ifdef __linux__ // this isn't needed elsewhere - static bool _warnedAboutFilesystem = false; - // if called without a fully qualified path it asserts; that makes mongoperf fail. - // so make a warning. need a better solution longer term. - // massert(13652, str::stream() << "Couldn't find parent dir for file: " << file.string(),); - if (!file.has_branch_path()) { - log() << "warning flushMyDirectory couldn't find parent dir for file: " << file.string(); - return; - } - - - boost::filesystem::path dir = file.branch_path(); // parent_path in new boosts - - LOG(1) << "flushing directory " << dir.string(); - - int fd = ::open(dir.string().c_str(), O_RDONLY); // DO NOT THROW OR ASSERT BEFORE CLOSING - massert(13650, - str::stream() << "Couldn't open directory '" << dir.string() << "' for flushing: " - << errnoWithDescription(), - fd >= 0); - if (fsync(fd) != 0) { - int e = errno; - if (e == EINVAL) { // indicates filesystem does not support synchronization - if (!_warnedAboutFilesystem) { - log() << "\tWARNING: This file system is not supported. 
For further information" - << " see:" << startupWarningsLog; - log() << "\t\t\thttp://dochub.mongodb.org/core/unsupported-filesystems" - << startupWarningsLog; - log() << "\t\tPlease notify MongoDB, Inc. if an unlisted filesystem generated " - << "this warning." << startupWarningsLog; - _warnedAboutFilesystem = true; - } - } else { - close(fd); - massert(13651, - str::stream() << "Couldn't fsync directory '" << dir.string() << "': " - << errnoWithDescription(e), - false); - } - } - close(fd); -#endif -} -} diff --git a/src/mongo/db/storage/mmap_v1/paths.h b/src/mongo/db/storage/mmap_v1/paths.h deleted file mode 100644 index 384b6459419..00000000000 --- a/src/mongo/db/storage/mmap_v1/paths.h +++ /dev/null @@ -1,100 +0,0 @@ -// @file paths.h -// file paths and directory handling - -/* Copyright 2010 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#pragma once - -#include <boost/filesystem/path.hpp> -#include <fcntl.h> -#include <sys/stat.h> -#include <sys/types.h> - -#include "mongo/util/mongoutils/str.h" - -#include "mongo/db/storage/storage_options.h" - -namespace mongo { - -using namespace mongoutils; - -/** this is very much like a boost::path. however, we define a new type to get some type - checking. if you want to say 'my param MUST be a relative path", use this. -*/ -struct RelativePath { - std::string _p; - - bool empty() const { - return _p.empty(); - } - - static RelativePath fromRelativePath(const std::string& f) { - RelativePath rp; - rp._p = f; - return rp; - } - - /** - * Returns path relative to 'dbpath' from a full path 'f'. 
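// POSIX sketch of the directory-flush sequence in flushMyDirectory() above:
// open the parent directory read-only, fsync() the directory handle, and
// treat EINVAL as "this filesystem cannot sync a directory" rather than a
// fatal error. Error reporting is reduced to a bool for brevity; the helper
// name is hypothetical.
#include <cerrno>
#include <fcntl.h>
#include <unistd.h>

bool flushParentDir(const char* dirPath) {
    int fd = ::open(dirPath, O_RDONLY);  // do not throw or assert before closing fd
    if (fd < 0)
        return false;
    bool ok = (::fsync(fd) == 0) || (errno == EINVAL);
    ::close(fd);
    return ok;
}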
- */ - static RelativePath fromFullPath(boost::filesystem::path dbpath, boost::filesystem::path f); - - std::string toString() const { - return _p; - } - - bool operator!=(const RelativePath& r) const { - return _p != r._p; - } - bool operator==(const RelativePath& r) const { - return _p == r._p; - } - bool operator<(const RelativePath& r) const { - return _p < r._p; - } - - std::string asFullPath() const { - boost::filesystem::path x(storageGlobalParams.dbpath); - x /= _p; - return x.string(); - } -}; - -dev_t getPartition(const std::string& path); - -inline bool onSamePartition(const std::string& path1, const std::string& path2) { - dev_t dev1 = getPartition(path1); - dev_t dev2 = getPartition(path2); - - return dev1 == dev2; -} - -void flushMyDirectory(const boost::filesystem::path& file); - -boost::filesystem::path ensureParentDirCreated(const boost::filesystem::path& p); -} diff --git a/src/mongo/db/storage/mmap_v1/record.h b/src/mongo/db/storage/mmap_v1/record.h deleted file mode 100644 index 401808742a9..00000000000 --- a/src/mongo/db/storage/mmap_v1/record.h +++ /dev/null @@ -1,181 +0,0 @@ -// database.h - -/** -* Copyright (C) 2008 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/base/static_assert.h" -#include "mongo/bson/bsonobjbuilder.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/record_data.h" -#include "mongo/platform/atomic_word.h" - -namespace mongo { - -class DeletedRecord; - -/* MmapV1RecordHeader is a record in a datafile. DeletedRecord is similar but for deleted space. - -*11:03:20 AM) dm10gen: regarding extentOfs... -(11:03:42 AM) dm10gen: an extent is a continugous disk area, which contains many Records and - DeleteRecords -(11:03:56 AM) dm10gen: a DiskLoc has two pieces, the fileno and ofs. 
(64 bit total) -(11:04:16 AM) dm10gen: to keep the headesr small, instead of storing a 64 bit ptr to the full extent - address, we keep just the offset -(11:04:29 AM) dm10gen: we can do this as we know the record's address, and it has the same fileNo -(11:04:33 AM) dm10gen: see class DiskLoc for more info -(11:04:43 AM) dm10gen: so that is how MmapV1RecordHeader::myExtent() works -(11:04:53 AM) dm10gen: on an alloc(), when we build a new MmapV1RecordHeader, we must populate its - extentOfs then -*/ -#pragma pack(1) -class MmapV1RecordHeader { -public: - enum HeaderSizeValue { HeaderSize = 16 }; - - int lengthWithHeaders() const { - return _lengthWithHeaders; - } - int& lengthWithHeaders() { - return _lengthWithHeaders; - } - - int extentOfs() const { - return _extentOfs; - } - int& extentOfs() { - return _extentOfs; - } - - int nextOfs() const { - return _nextOfs; - } - int& nextOfs() { - return _nextOfs; - } - - int prevOfs() const { - return _prevOfs; - } - int& prevOfs() { - return _prevOfs; - } - - const char* data() const { - return _data; - } - char* data() { - return _data; - } - - // XXX remove - const char* dataNoThrowing() const { - return _data; - } - char* dataNoThrowing() { - return _data; - } - - int netLength() const { - return _netLength(); - } - - /* use this when a record is deleted. basically a union with next/prev fields */ - DeletedRecord& asDeleted() { - return *((DeletedRecord*)this); - } - - DiskLoc myExtentLoc(const DiskLoc& myLoc) const { - return DiskLoc(myLoc.a(), extentOfs()); - } - - struct NP { - int nextOfs; - int prevOfs; - }; - - NP* np() { - return (NP*)&_nextOfs; - } - - RecordData toRecordData() const { - return RecordData(_data, _netLength()); - } - -private: - int _netLength() const { - return _lengthWithHeaders - HeaderSize; - } - - int _lengthWithHeaders; - int _extentOfs; - int _nextOfs; - int _prevOfs; - - /** be careful when referencing this that your write intent was correct */ - char _data[4]; - -public: - static bool MemoryTrackingEnabled; -}; -#pragma pack() - -// TODO: this probably moves to record_store.h -class DeletedRecord { -public: - int lengthWithHeaders() const { - return _lengthWithHeaders; - } - int& lengthWithHeaders() { - return _lengthWithHeaders; - } - - int extentOfs() const { - return _extentOfs; - } - int& extentOfs() { - return _extentOfs; - } - - // TODO: we need to not const_cast here but problem is DiskLoc::writing - DiskLoc& nextDeleted() const { - return const_cast<DiskLoc&>(_nextDeleted); - } - -private: - int _lengthWithHeaders; - - int _extentOfs; - - DiskLoc _nextDeleted; -}; - -MONGO_STATIC_ASSERT(16 == sizeof(DeletedRecord)); - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp b/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp deleted file mode 100644 index 1d55d272efc..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_access_tracker.cpp +++ /dev/null @@ -1,338 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
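// Standalone check of the fixed header layout in MmapV1RecordHeader above:
// four packed 4-byte fields account for the 16-byte HeaderSize that
// netLength() subtracts from lengthWithHeaders. DemoHeader is a stripped
// stand-in, not the real class (which also carries the _data payload).
#include <cassert>

#pragma pack(1)
struct DemoHeader {
    int lengthWithHeaders;
    int extentOfs;
    int nextOfs;
    int prevOfs;
};
#pragma pack()

int main() {
    assert(sizeof(DemoHeader) == 16);        // matches HeaderSize = 16
    DemoHeader h = {100, 0, 0, 0};
    assert(h.lengthWithHeaders - 16 == 84);  // netLength() for a 100-byte allocation
    return 0;
}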
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/record_access_tracker.h" - -#include <cstring> - -#include "mongo/base/init.h" -#include "mongo/config.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/platform/bits.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/clock_source.h" -#include "mongo/util/debug_util.h" -#include "mongo/util/processinfo.h" - -namespace mongo { - -namespace { - -static bool blockSupported = false; - -MONGO_INITIALIZER(RecordBlockSupported)(InitializerContext* cx) { - blockSupported = ProcessInfo::blockCheckSupported(); - return Status::OK(); -} - -int hash(size_t region) { - return abs(((7 + (int)(region & 0xFFFF)) * (11 + (int)((region >> 16) & 0xFFFF)) -#if defined(_WIN64) || defined(__amd64__) - * - (13 + (int)((region >> 32) & 0xFFFF)) * (17 + (int)((region >> 48) & 0xFFFF)) -#endif - ) % - RecordAccessTracker::SliceSize); -} - -int bigHash(size_t region) { - return hash(region) % RecordAccessTracker::BigHashSize; -} - -namespace PointerTable { - -/* A "superpage" is a group of 16 contiguous pages that differ - * only in the low-order 16 bits. This means that there is - * enough room in the low-order bits to store a bitmap for each - * page in the superpage. 
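// Quick standalone check of the superpage encoding documented above: the
// stored entry keeps the superpage base in the high bits and uses the free
// low-order bits as a 16-page bitmap, with a page's bit selected by
// (ptr & 0xf000) >> 12. Mirrors superpageOf()/pageBitOf() below; the sample
// addresses are arbitrary 64-bit values.
#include <cassert>

typedef unsigned long long Ptr;  // wide enough for a 64-bit address

Ptr demoSuperpageOf(Ptr ptr) { return ptr & ~0xffffULL; }
Ptr demoPageBitOf(Ptr ptr) { return 1ULL << ((ptr & 0xf000ULL) >> 12); }

int main() {
    Ptr entry = demoSuperpageOf(0x7f1234563abcULL);  // stored entry: base, empty bitmap
    Ptr ptr = 0x7f1234563abcULL;                     // lands in page 3 of that superpage
    assert(demoSuperpageOf(ptr) == demoSuperpageOf(entry));  // same superpage
    assert(!(entry & demoPageBitOf(ptr)));           // page not yet seen
    entry |= demoPageBitOf(ptr);                     // markPageSeen()
    assert(entry & demoPageBitOf(ptr));              // haveSeenPage() now true
    return 0;
}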
- */ -static const size_t superpageMask = ~0xffffLL; -static const size_t superpageShift = 16; -static const size_t pageSelectorMask = 0xf000LL; // selects a page in a superpage -static const int pageSelectorShift = 12; - -// Tunables -static const int capacity = 128; // in superpages -static const int bucketSize = 4; // half cache line -static const int buckets = capacity / bucketSize; - -struct Data { - /** organized similar to a CPU cache - * bucketSize-way set associative - * least-recently-inserted replacement policy - */ - size_t _table[buckets][bucketSize]; - long long _lastReset; // time in millis -}; - -void reset(Data* data, ClockSource* cs) { - memset(data->_table, 0, sizeof(data->_table)); - data->_lastReset = cs->now().toMillisSinceEpoch(); -} - -inline void resetIfNeeded(Data* data, ClockSource* cs) { - const long long sinceReset = cs->now().toMillisSinceEpoch() - data->_lastReset; - if (MONGO_unlikely(sinceReset > RecordAccessTracker::RotateTimeSecs * 1000)) { - reset(data, cs); - } -} - -inline size_t pageBitOf(size_t ptr) { - return 1LL << ((ptr & pageSelectorMask) >> pageSelectorShift); -} - -inline size_t superpageOf(size_t ptr) { - return ptr & superpageMask; -} - -inline size_t bucketFor(size_t ptr) { - return (ptr >> superpageShift) % buckets; -} - -inline bool haveSeenPage(size_t superpage, size_t ptr) { - return superpage & pageBitOf(ptr); -} - -inline void markPageSeen(size_t& superpage, size_t ptr) { - superpage |= pageBitOf(ptr); -} - -/** call this to check a page has been seen yet. */ -inline bool seen(Data* data, size_t ptr, ClockSource* cs) { - resetIfNeeded(data, cs); - - // A bucket contains 4 superpages each containing 16 contiguous pages - // See above for a more detailed explanation of superpages - size_t* bucket = data->_table[bucketFor(ptr)]; - - for (int i = 0; i < bucketSize; i++) { - if (superpageOf(ptr) == superpageOf(bucket[i])) { - if (haveSeenPage(bucket[i], ptr)) - return true; - - markPageSeen(bucket[i], ptr); - return false; - } - } - - // superpage isn't in thread-local cache - // slide bucket forward and add new superpage at front - for (int i = bucketSize - 1; i > 0; i--) - bucket[i] = bucket[i - 1]; - - bucket[0] = superpageOf(ptr); - markPageSeen(bucket[0], ptr); - - return false; -} - -Data* getData(); - -}; // namespace PointerTable - -} // namespace - -// -// Slice -// - -RecordAccessTracker::Slice::Slice() { - reset(); -} - -void RecordAccessTracker::Slice::reset() { - memset(_data, 0, sizeof(_data)); -} - -RecordAccessTracker::State RecordAccessTracker::Slice::get(int regionHash, - size_t region, - short offset) { - DEV verify(hash(region) == regionHash); - - Entry* e = _get(regionHash, region, false); - if (!e) - return Unk; - - return (e->value & (1ULL << offset)) ? 
In : Out; -} - -bool RecordAccessTracker::Slice::put(int regionHash, size_t region, short offset) { - DEV verify(hash(region) == regionHash); - - Entry* e = _get(regionHash, region, true); - if (!e) - return false; - - e->value |= 1ULL << offset; - return true; -} - -RecordAccessTracker::Entry* RecordAccessTracker::Slice::_get(int start, size_t region, bool add) { - for (int i = 0; i < MaxChain; i++) { - int bucket = (start + i) % SliceSize; - - if (_data[bucket].region == 0) { - if (!add) - return NULL; - - _data[bucket].region = region; - return &_data[bucket]; - } - - if (_data[bucket].region == region) { - return &_data[bucket]; - } - } - - return NULL; -} - -// -// Rolling -// - -bool RecordAccessTracker::Rolling::access(size_t region, - short offset, - bool doHalf, - ClockSource* cs) { - int regionHash = hash(region); - - stdx::lock_guard<SimpleMutex> lk(_lock); - - static int rarelyCount = 0; - if (rarelyCount++ % (2048 / BigHashSize) == 0) { - Date_t now = cs->now(); - - if (now - _lastRotate > Seconds(static_cast<int64_t>(RotateTimeSecs))) { - _rotate(cs); - } - } - - for (int i = 0; i < NumSlices / (doHalf ? 2 : 1); i++) { - int pos = (_curSlice + i) % NumSlices; - State s = _slices[pos].get(regionHash, region, offset); - - if (s == In) - return true; - - if (s == Out) { - _slices[pos].put(regionHash, region, offset); - return false; - } - } - - // we weren't in any slice - // so add to cur - if (!_slices[_curSlice].put(regionHash, region, offset)) { - _rotate(cs); - _slices[_curSlice].put(regionHash, region, offset); - } - return false; -} - -void RecordAccessTracker::Rolling::updateLastRotate(ClockSource* cs) { - _lastRotate = cs->now(); -} - -void RecordAccessTracker::Rolling::_rotate(ClockSource* cs) { - _curSlice = (_curSlice + 1) % NumSlices; - _slices[_curSlice].reset(); - updateLastRotate(cs); -} - -PointerTable::Data* PointerTable::getData() { - thread_local std::unique_ptr<PointerTable::Data> data; - if (!data) - data = stdx::make_unique<PointerTable::Data>(); - return data.get(); -} - -// -// RecordAccessTracker -// - -RecordAccessTracker::RecordAccessTracker(ClockSource* cs) - : _blockSupported(blockSupported), _clock(cs) { - reset(); -} - -void RecordAccessTracker::reset() { - PointerTable::reset(PointerTable::getData(), _clock); - _rollingTable.reset(new Rolling[BigHashSize]); - for (int i = 0; i < BigHashSize; i++) { - _rollingTable[i].updateLastRotate(_clock); - } -} - -void RecordAccessTracker::markAccessed(const void* record) { - const size_t page = reinterpret_cast<size_t>(record) >> 12; - const size_t region = page >> 6; - const size_t offset = page & 0x3f; - - const bool seen = - PointerTable::seen(PointerTable::getData(), reinterpret_cast<size_t>(record), _clock); - if (!seen) { - _rollingTable[bigHash(region)].access(region, offset, true, _clock); - } -} - - -bool RecordAccessTracker::checkAccessedAndMark(const void* record) { - const size_t page = reinterpret_cast<size_t>(record) >> 12; - const size_t region = page >> 6; - const size_t offset = page & 0x3f; - - // This is like the "L1 cache". If we're a miss then we fall through and check the - // "L2 cache". If we're still a miss, then we defer to a system-specific system - // call (or give up and return false if deferring to the system call is not enabled). - if (PointerTable::seen(PointerTable::getData(), reinterpret_cast<size_t>(record), _clock)) { - return true; - } - - // We were a miss in the PointerTable. See if we can find 'record' in the Rolling table. 
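// Standalone check of the address decomposition used by markAccessed() and
// checkAccessedAndMark() above: 4KB pages (ptr >> 12), 64 pages per region
// (page >> 6), and a 6-bit offset selecting the page's bit within a region's
// 64-bit word. 0x30000 matches a value the unit tests below use.
#include <cassert>
#include <cstddef>

int main() {
    const size_t record = 0x30000;
    const size_t page = record >> 12;   // 4KB pages
    const size_t region = page >> 6;    // 64 pages per region
    const size_t offset = page & 0x3f;  // bit index inside the region word
    assert(page == 0x30 && region == 0 && offset == 0x30);
    assert(offset < 64);                // always fits an unsigned long long mask
    return 0;
}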
- if (_rollingTable[bigHash(region)].access(region, offset, false, _clock)) { - return true; - } - - if (!_blockSupported) { - // This means we don't fall back to a system call. Instead we assume things aren't - // in memory. This could mean that we yield too much, but this is much better - // than the alternative of not yielding through a page fault. - return false; - } - - return ProcessInfo::blockInMemory(const_cast<void*>(record)); -} - -void RecordAccessTracker::disableSystemBlockInMemCheck() { - _blockSupported = false; -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker.h b/src/mongo/db/storage/mmap_v1/record_access_tracker.h deleted file mode 100644 index c4ec579c720..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_access_tracker.h +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <memory> - -#include "mongo/util/concurrency/mutex.h" -#include "mongo/util/time_support.h" - -namespace mongo { - -class ClockSource; -class Date_t; -class MmapV1RecordHeader; - -/** - * Used to implement likelyInPhysicalMemory() for the MMAP v1 storage engine. Since - * MMAP v1 holds exclusive collection-level locks, it should yield the locks during a - * page fault. The RecordAccessTracker is used to guess at which records are in memory, - * so that a yield can be requested unless we're sure that the record has been - * recently accessed. - */ -class RecordAccessTracker { - MONGO_DISALLOW_COPYING(RecordAccessTracker); - -public: - RecordAccessTracker(ClockSource* cs); - - enum Constants { - SliceSize = 1024, - MaxChain = 20, // intentionally very low - NumSlices = 10, - RotateTimeSecs = 90, - BigHashSize = 128 - }; - - /** - * Informs this record access tracker that 'record' has been accessed. - */ - void markAccessed(const void* record); - - /** - * @return whether or not 'record' has been marked as accessed recently. A return value - * of true means that 'record' is likely in physical memory. - * - * Also has the side effect of marking 'record' as accessed. 
- */ - bool checkAccessedAndMark(const void* record); - - /** - * Clears out any history of record accesses. - */ - void reset(); - - // - // For testing. - // - - /** - * The accessedRecently() implementation falls back to making a system call if it - * appears that the record is not in physical memory. Use this method to disable - * the fallback for testing. - */ - void disableSystemBlockInMemCheck(); - -private: - enum State { In, Out, Unk }; - - struct Entry { - size_t region; - unsigned long long value; - }; - - /** - * simple hash map for region -> status - * this constitutes a single region of time - * it does chaining, but very short chains - */ - class Slice { - public: - Slice(); - - void reset(); - - State get(int regionHash, size_t region, short offset); - - /** - * @return true if added, false if full - */ - bool put(int regionHash, size_t region, short offset); - - private: - Entry* _get(int start, size_t region, bool add); - - Entry _data[SliceSize]; - }; - - /** - * this contains many slices of times - * the idea you put mem status in the current time slice - * and then after a certain period of time, it rolls off so we check again - */ - class Rolling { - public: - Rolling() = default; - - /** - * After this call, we assume the page is in RAM. - * - * @param doHalf if this is a known good access, want to put in first half. - * - * @return whether we know the page is in RAM - */ - bool access(size_t region, short offset, bool doHalf, ClockSource* cs); - - /** - * Updates _lastRotate to the current time. - */ - void updateLastRotate(ClockSource* cs); - - private: - void _rotate(ClockSource* cs); - - int _curSlice = 0; - Date_t _lastRotate; - Slice _slices[NumSlices]; - - SimpleMutex _lock; - }; - - // Should this record tracker fallback to making a system call? - bool _blockSupported; - ClockSource* _clock; - - // An array of Rolling instances for tracking record accesses. - std::unique_ptr<Rolling[]> _rollingTable; -}; - -} // namespace diff --git a/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp b/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp deleted file mode 100644 index 7b5c13a1029..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_access_tracker_test.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. 
If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/storage/mmap_v1/record_access_tracker.h" - -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/stdx/memory.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/clock_source_mock.h" - -using namespace mongo; - -namespace { - -const std::unique_ptr<ClockSource> clock = stdx::make_unique<ClockSourceMock>(); - -const void* pointerOf(int data) { -#pragma warning(push) -// C4312: 'reinterpret_cast': conversion from 'int' to 'const void *' of greater size -#pragma warning(disable : 4312) - return reinterpret_cast<const void*>(data); -#pragma warning(pop) -} - -TEST(RecordAccessTrackerTest, TouchRecordTwice) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - const void* record = pointerOf(0x10003); - - ASSERT_FALSE(tracker.checkAccessedAndMark(record)); - ASSERT_TRUE(tracker.checkAccessedAndMark(record)); -} - -TEST(RecordAccessTrackerTest, TouchPageTwice) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - const void* firstRecord = pointerOf(0x10003); - const void* secondRecord = pointerOf(0x10004); - - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecord)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord)); - ASSERT_TRUE(tracker.checkAccessedAndMark(firstRecord)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecord)); -} - -TEST(RecordAccessTrackerTest, TouchTwoPagesTwice) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - const void* firstRecordFirstPage = pointerOf(0x11000); - const void* secondRecordFirstPage = pointerOf(0x11100); - - const void* firstRecordSecondPage = pointerOf(0x12000); - const void* secondRecordSecondPage = pointerOf(0x12100); - - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage)); - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage)); -} - -// Tests RecordAccessTracker::reset(). -TEST(RecordAccessTrackerTest, TouchTwoPagesTwiceWithReset) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - const void* firstRecordFirstPage = pointerOf(0x11000); - const void* secondRecordFirstPage = pointerOf(0x11100); - - const void* firstRecordSecondPage = pointerOf(0x12000); - const void* secondRecordSecondPage = pointerOf(0x12100); - - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage)); - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage)); - - // Now reset and make sure things look as though we have a fresh RecordAccessTracker. - tracker.reset(); - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordFirstPage)); - ASSERT_FALSE(tracker.checkAccessedAndMark(firstRecordSecondPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordFirstPage)); - ASSERT_TRUE(tracker.checkAccessedAndMark(secondRecordSecondPage)); -} - -// Tests RecordAccessTracker::markAccessed(). 
-TEST(RecordAccessTrackerTest, AccessTest) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - // Mark the first page in superpage 3 as accessed. - const void* record = pointerOf(0x30000); - tracker.markAccessed(record); - - // Test that all remaining addresses in the page give true when asked whether they are - // recently accessed. - for (int i = 0x30001; i < 0x31000; i++) { - const void* touchedPageRecord = pointerOf(i); - ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord)); - } -} - -// Touch pages in 128 separate superpages, and make sure that they all are reported as -// recently accessed. -TEST(RecordAccessTrackerTest, Access128Superpages) { - RecordAccessTracker tracker(clock.get()); - tracker.disableSystemBlockInMemCheck(); - - // Touch the pages. - for (int i = 0x00000; i < 0x800000; i += 0x10000) { - const void* touchedPageRecord = pointerOf(i); - tracker.markAccessed(touchedPageRecord); - } - - // Ensure we know that the pages have all been touched. - for (int i = 0x00000; i < 0x800000; i += 0x10000) { - // It should be fine if there is an offset of, say, 0xA, into the page. - const void* touchedPageRecord = pointerOf(i + 0xA); - ASSERT_TRUE(tracker.checkAccessedAndMark(touchedPageRecord)); - } -} - -} // namespace diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp deleted file mode 100644 index 6bfcaefcdde..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_base.cpp +++ /dev/null @@ -1,962 +0,0 @@ -// record_store_v1_base.cpp - -/** - * Copyright (C) 2013-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" - -#include "mongo/base/static_assert.h" -#include "mongo/db/catalog/collection.h" -#include "mongo/db/client.h" -#include "mongo/db/curop.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" -#include "mongo/db/storage/mmap_v1/touch_pages.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/log.h" -#include "mongo/util/progress_meter.h" -#include "mongo/util/timer.h" - -namespace mongo { - -using std::unique_ptr; -using std::set; -using std::string; - -/* Deleted list buckets are used to quickly locate free space based on size. Each bucket - contains records up to that size (meaning a record with a size exactly equal to - bucketSizes[n] would go into bucket n+1). -*/ -const int RecordStoreV1Base::bucketSizes[] = { - 0x20, - 0x40, - 0x80, - 0x100, // 32, 64, 128, 256 - 0x200, - 0x400, - 0x800, - 0x1000, // 512, 1K, 2K, 4K - 0x2000, - 0x4000, - 0x8000, - 0x10000, // 8K, 16K, 32K, 64K - 0x20000, - 0x40000, - 0x80000, - 0x100000, // 128K, 256K, 512K, 1M - 0x200000, - 0x400000, - 0x600000, - 0x800000, // 2M, 4M, 6M, 8M - 0xA00000, - 0xC00000, - 0xE00000, // 10M, 12M, 14M, - MaxAllowedAllocation, // 16.5M - MaxAllowedAllocation + 1, // Only MaxAllowedAllocation sized records go here. - INT_MAX, // "oversized" bucket for unused parts of extents. -}; - -// If this fails, it means that bucketSizes doesn't have the correct number of entries. -MONGO_STATIC_ASSERT(sizeof(RecordStoreV1Base::bucketSizes) / - sizeof(RecordStoreV1Base::bucketSizes[0]) == - RecordStoreV1Base::Buckets); - -SavedCursorRegistry::~SavedCursorRegistry() { - for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end(); it++) { - (*it)->_registry = NULL; // prevent SavedCursor destructor from accessing this - } -} - -void SavedCursorRegistry::registerCursor(SavedCursor* cursor) { - invariant(!cursor->_registry); - cursor->_registry = this; - scoped_spinlock lock(_mutex); - _cursors.insert(cursor); -} - -bool SavedCursorRegistry::unregisterCursor(SavedCursor* cursor) { - if (!cursor->_registry) { - return false; - } - invariant(cursor->_registry == this); - cursor->_registry = NULL; - scoped_spinlock lock(_mutex); - invariant(_cursors.erase(cursor)); - return true; -} - -void SavedCursorRegistry::invalidateCursorsForBucket(DiskLoc bucket) { - // While this is not strictly necessary as an exclusive collection lock will be held, - // it's cleaner to just make the SavedCursorRegistry thread-safe. Spinlock is OK here. 
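// Standalone check of the bucket rule quoted above bucketSizes[]: a record
// whose size equals bucketSizes[n] exactly goes into bucket n + 1, i.e. the
// first bucket whose bound is strictly greater than the record size. The
// array here is trimmed to the four smallest buckets for brevity.
#include <cassert>

int demoBucketFor(int recordSize) {
    static const int bucketSizes[] = {0x20, 0x40, 0x80, 0x100};
    int n = 0;
    while (recordSize >= bucketSizes[n])
        ++n;  // keep walking while the bound is <= the record size
    return n;
}

int main() {
    assert(demoBucketFor(0x1f) == 0);  // under 32 bytes: first bucket
    assert(demoBucketFor(0x20) == 1);  // exactly 32 bytes: next bucket up
    assert(demoBucketFor(0xff) == 3);  // fits under the 256-byte bound
    return 0;
}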
- scoped_spinlock lock(_mutex); - for (SavedCursorSet::iterator it = _cursors.begin(); it != _cursors.end();) { - if ((*it)->bucket == bucket) { - (*it)->_registry = NULL; // prevent ~SavedCursor from trying to unregister - _cursors.erase(it++); - } else { - it++; - } - } -} - -RecordStoreV1Base::RecordStoreV1Base(StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes) - : RecordStore(ns), _details(details), _extentManager(em), _isSystemIndexes(isSystemIndexes) {} - -RecordStoreV1Base::~RecordStoreV1Base() {} - - -int64_t RecordStoreV1Base::storageSize(OperationContext* opCtx, - BSONObjBuilder* extraInfo, - int level) const { - BSONArrayBuilder extentInfo; - - int64_t total = 0; - int n = 0; - - DiskLoc cur = _details->firstExtent(opCtx); - - while (!cur.isNull()) { - Extent* e = _extentManager->getExtent(cur); - - total += e->length; - n++; - - if (extraInfo && level > 0) { - extentInfo.append(BSON("len" << e->length << "loc: " << e->myLoc.toBSONObj())); - } - cur = e->xnext; - } - - if (extraInfo) { - extraInfo->append("numExtents", n); - if (level > 0) - extraInfo->append("extents", extentInfo.arr()); - } - - return total; -} - -RecordData RecordStoreV1Base::dataFor(OperationContext* opCtx, const RecordId& loc) const { - return recordFor(DiskLoc::fromRecordId(loc))->toRecordData(); -} - -bool RecordStoreV1Base::findRecord(OperationContext* opCtx, - const RecordId& loc, - RecordData* rd) const { - // this is a bit odd, as the semantics of using the storage engine imply it _has_ to be. - // And in fact we can't actually check. - // So we assume the best. - MmapV1RecordHeader* rec = recordFor(DiskLoc::fromRecordId(loc)); - if (!rec) { - return false; - } - *rd = rec->toRecordData(); - return true; -} - -MmapV1RecordHeader* RecordStoreV1Base::recordFor(const DiskLoc& loc) const { - return _extentManager->recordForV1(loc); -} - -const DeletedRecord* RecordStoreV1Base::deletedRecordFor(const DiskLoc& loc) const { - invariant(loc.a() != -1); - return reinterpret_cast<const DeletedRecord*>(recordFor(loc)); -} - -DeletedRecord* RecordStoreV1Base::drec(const DiskLoc& loc) const { - invariant(loc.a() != -1); - return reinterpret_cast<DeletedRecord*>(recordFor(loc)); -} - -Extent* RecordStoreV1Base::_getExtent(OperationContext* opCtx, const DiskLoc& loc) const { - return _extentManager->getExtent(loc); -} - -DiskLoc RecordStoreV1Base::_getExtentLocForRecord(OperationContext* opCtx, - const DiskLoc& loc) const { - return _extentManager->extentLocForV1(loc); -} - - -DiskLoc RecordStoreV1Base::getNextRecord(OperationContext* opCtx, const DiskLoc& loc) const { - DiskLoc next = getNextRecordInExtent(opCtx, loc); - if (!next.isNull()) { - return next; - } - - // now traverse extents - - Extent* e = _getExtent(opCtx, _getExtentLocForRecord(opCtx, loc)); - while (1) { - if (e->xnext.isNull()) - return DiskLoc(); // end of collection - e = _getExtent(opCtx, e->xnext); - if (!e->firstRecord.isNull()) - break; - // entire extent could be empty, keep looking - } - return e->firstRecord; -} - -DiskLoc RecordStoreV1Base::getPrevRecord(OperationContext* opCtx, const DiskLoc& loc) const { - DiskLoc prev = getPrevRecordInExtent(opCtx, loc); - if (!prev.isNull()) { - return prev; - } - - // now traverse extents - - Extent* e = _getExtent(opCtx, _getExtentLocForRecord(opCtx, loc)); - while (1) { - if (e->xprev.isNull()) - return DiskLoc(); // end of collection - e = _getExtent(opCtx, e->xprev); - if (!e->firstRecord.isNull()) - break; - // entire extent could be empty, keep 
looking - } - return e->lastRecord; -} - -DiskLoc RecordStoreV1Base::_findFirstSpot(OperationContext* opCtx, - const DiskLoc& extDiskLoc, - Extent* e) { - DiskLoc emptyLoc = extDiskLoc; - emptyLoc.inc(Extent::HeaderSize()); - int delRecLength = e->length - Extent::HeaderSize(); - if (delRecLength >= 32 * 1024 && NamespaceString::virtualized(_ns) && !isCapped()) { - // probably an index. so skip forward to keep its records page aligned - int& ofs = emptyLoc.GETOFS(); - int newOfs = (ofs + 0xfff) & ~0xfff; - delRecLength -= (newOfs - ofs); - dassert(delRecLength > 0); - ofs = newOfs; - } - - DeletedRecord* empty = opCtx->recoveryUnit()->writing(drec(emptyLoc)); - empty->lengthWithHeaders() = delRecLength; - empty->extentOfs() = e->myLoc.getOfs(); - empty->nextDeleted().Null(); - return emptyLoc; -} - -DiskLoc RecordStoreV1Base::getNextRecordInExtent(OperationContext* opCtx, - const DiskLoc& loc) const { - int nextOffset = recordFor(loc)->nextOfs(); - - if (nextOffset == DiskLoc::NullOfs) - return DiskLoc(); - - fassert(17441, abs(nextOffset) >= 8); // defensive - DiskLoc result(loc.a(), nextOffset); - return result; -} - -DiskLoc RecordStoreV1Base::getPrevRecordInExtent(OperationContext* opCtx, - const DiskLoc& loc) const { - int prevOffset = recordFor(loc)->prevOfs(); - - if (prevOffset == DiskLoc::NullOfs) - return DiskLoc(); - - fassert(17442, abs(prevOffset) >= 8); // defensive - DiskLoc result(loc.a(), prevOffset); - return result; -} - -Status RecordStoreV1Base::insertRecordsWithDocWriter(OperationContext* opCtx, - const DocWriter* const* docs, - const Timestamp*, - size_t nDocs, - RecordId* idsOut) { - for (size_t i = 0; i < nDocs; i++) { - int docSize = docs[i]->documentSize(); - if (docSize < 4) { - return Status(ErrorCodes::InvalidLength, "record has to be >= 4 bytes"); - } - const int lenWHdr = docSize + MmapV1RecordHeader::HeaderSize; - if (lenWHdr > MaxAllowedAllocation) { - return Status(ErrorCodes::InvalidLength, "record has to be <= 16.5MB"); - } - const int lenToAlloc = (docs[i]->addPadding() && shouldPadInserts()) - ? quantizeAllocationSpace(lenWHdr) - : lenWHdr; - - StatusWith<DiskLoc> loc = allocRecord(opCtx, lenToAlloc, /*enforceQuota=*/false); - if (!loc.isOK()) - return loc.getStatus(); - - MmapV1RecordHeader* r = recordFor(loc.getValue()); - fassert(17319, r->lengthWithHeaders() >= lenWHdr); - - r = reinterpret_cast<MmapV1RecordHeader*>(opCtx->recoveryUnit()->writingPtr(r, lenWHdr)); - docs[i]->writeDocument(r->data()); - - _addRecordToRecListInExtent(opCtx, r, loc.getValue()); - - _details->incrementStats(opCtx, r->netLength(), 1); - - if (idsOut) - idsOut[i] = loc.getValue().toRecordId(); - } - - - return Status::OK(); -} - - -StatusWith<RecordId> RecordStoreV1Base::insertRecord( - OperationContext* opCtx, const char* data, int len, Timestamp, bool enforceQuota) { - if (len < 4) { - return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be >= 4 bytes"); - } - - if (len + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation) { - return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB"); - } - - return _insertRecord(opCtx, data, len, enforceQuota); -} - -StatusWith<RecordId> RecordStoreV1Base::_insertRecord(OperationContext* opCtx, - const char* data, - int len, - bool enforceQuota) { - const int lenWHdr = len + MmapV1RecordHeader::HeaderSize; - const int lenToAlloc = shouldPadInserts() ? 
quantizeAllocationSpace(lenWHdr) : lenWHdr; - fassert(17208, lenToAlloc >= lenWHdr); - - StatusWith<DiskLoc> loc = allocRecord(opCtx, lenToAlloc, enforceQuota); - if (!loc.isOK()) - return StatusWith<RecordId>(loc.getStatus()); - - MmapV1RecordHeader* r = recordFor(loc.getValue()); - fassert(17210, r->lengthWithHeaders() >= lenWHdr); - - // copy the data - r = reinterpret_cast<MmapV1RecordHeader*>(opCtx->recoveryUnit()->writingPtr(r, lenWHdr)); - memcpy(r->data(), data, len); - - _addRecordToRecListInExtent(opCtx, r, loc.getValue()); - - _details->incrementStats(opCtx, r->netLength(), 1); - - return StatusWith<RecordId>(loc.getValue().toRecordId()); -} - -Status RecordStoreV1Base::updateRecord(OperationContext* opCtx, - const RecordId& oldLocation, - const char* data, - int dataSize, - bool enforceQuota, - UpdateNotifier* notifier) { - MmapV1RecordHeader* oldRecord = recordFor(DiskLoc::fromRecordId(oldLocation)); - if (oldRecord->netLength() >= dataSize) { - // Make sure to notify other queries before we do an in-place update. - if (notifier) { - Status callbackStatus = notifier->recordStoreGoingToUpdateInPlace(opCtx, oldLocation); - if (!callbackStatus.isOK()) - return callbackStatus; - } - - // we fit - memcpy(opCtx->recoveryUnit()->writingPtr(oldRecord->data(), dataSize), data, dataSize); - return Status::OK(); - } - - // We enforce the restriction of unchanging capped doc sizes above the storage layer. - invariant(!isCapped()); - - return {ErrorCodes::NeedsDocumentMove, "Update requires document move"}; -} - -bool RecordStoreV1Base::updateWithDamagesSupported() const { - return true; -} - -StatusWith<RecordData> RecordStoreV1Base::updateWithDamages( - OperationContext* opCtx, - const RecordId& loc, - const RecordData& oldRec, - const char* damageSource, - const mutablebson::DamageVector& damages) { - MmapV1RecordHeader* rec = recordFor(DiskLoc::fromRecordId(loc)); - char* root = rec->data(); - - // All updates were in place. Apply them via durability and writing pointer. 
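The damage loop that follows applies each event independently: declare the target byte range writable through the recovery unit, then copy from the damage source. The same shape over a plain buffer, with a hypothetical Damage struct standing in for mutablebson::DamageEvent and no durability layer:

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical mirror of mutablebson::DamageEvent.
struct Damage {
    std::size_t sourceOffset;  // where the replacement bytes start in the source
    std::size_t targetOffset;  // where they land in the record
    std::size_t size;          // byte count
};

// Apply every damage with an independent copy, as updateWithDamages does;
// here the "declare writable" step is gone, so the memcpy is direct.
static void applyDamages(char* record, const char* source, const std::vector<Damage>& damages) {
    for (const Damage& d : damages)
        std::memcpy(record + d.targetOffset, source + d.sourceOffset, d.size);
}

int main() {
    char rec[16] = "aaaaaaaaaaaaaaa";
    applyDamages(rec, "XYZ", {{0, 4, 3}});  // splice "XYZ" in at offset 4
    std::printf("%s\n", rec);               // prints aaaaXYZaaaaaaaa
    return 0;
}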
-    mutablebson::DamageVector::const_iterator where = damages.begin();
-    const mutablebson::DamageVector::const_iterator end = damages.end();
-    for (; where != end; ++where) {
-        const char* sourcePtr = damageSource + where->sourceOffset;
-        void* targetPtr =
-            opCtx->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
-        std::memcpy(targetPtr, sourcePtr, where->size);
-    }
-
-    return rec->toRecordData();
-}
-
-void RecordStoreV1Base::deleteRecord(OperationContext* opCtx, const RecordId& rid) {
-    const DiskLoc dl = DiskLoc::fromRecordId(rid);
-
-    MmapV1RecordHeader* todelete = recordFor(dl);
-    invariant(todelete->netLength() >= 4);  // this is required for defensive code
-
-    /* remove ourself from the record next/prev chain */
-    {
-        if (todelete->prevOfs() != DiskLoc::NullOfs) {
-            DiskLoc prev = getPrevRecordInExtent(opCtx, dl);
-            MmapV1RecordHeader* prevRecord = recordFor(prev);
-            opCtx->recoveryUnit()->writingInt(prevRecord->nextOfs()) = todelete->nextOfs();
-        }
-
-        if (todelete->nextOfs() != DiskLoc::NullOfs) {
-            DiskLoc next = getNextRecord(opCtx, dl);
-            MmapV1RecordHeader* nextRecord = recordFor(next);
-            opCtx->recoveryUnit()->writingInt(nextRecord->prevOfs()) = todelete->prevOfs();
-        }
-    }
-
-    /* remove ourself from extent pointers */
-    {
-        DiskLoc extentLoc = todelete->myExtentLoc(dl);
-        Extent* e = _getExtent(opCtx, extentLoc);
-        if (e->firstRecord == dl) {
-            opCtx->recoveryUnit()->writing(&e->firstRecord);
-            if (todelete->nextOfs() == DiskLoc::NullOfs)
-                e->firstRecord.Null();
-            else
-                e->firstRecord.set(dl.a(), todelete->nextOfs());
-        }
-        if (e->lastRecord == dl) {
-            opCtx->recoveryUnit()->writing(&e->lastRecord);
-            if (todelete->prevOfs() == DiskLoc::NullOfs)
-                e->lastRecord.Null();
-            else
-                e->lastRecord.set(dl.a(), todelete->prevOfs());
-        }
-    }
-
-    /* add to the free list */
-    {
-        _details->incrementStats(opCtx, -1 * todelete->netLength(), -1);
-
-        if (_isSystemIndexes) {
-            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
-               careful until validated more, as IndexDetails has pointers
-               to this disk location. so an incorrectly done remove would cause
-               a lot of problems.
- */ - memset(opCtx->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders()), - 0, - todelete->lengthWithHeaders()); - } else { - // this is defensive so we can detect if we are still using a location - // that was deleted - memset(opCtx->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4); - addDeletedRec(opCtx, dl); - } - } -} - -std::unique_ptr<RecordCursor> RecordStoreV1Base::getCursorForRepair(OperationContext* opCtx) const { - return stdx::make_unique<RecordStoreV1RepairCursor>(opCtx, this); -} - -void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* opCtx, - MmapV1RecordHeader* r, - DiskLoc loc) { - dassert(recordFor(loc) == r); - DiskLoc extentLoc = _getExtentLocForRecord(opCtx, loc); - Extent* e = _getExtent(opCtx, extentLoc); - if (e->lastRecord.isNull()) { - *opCtx->recoveryUnit()->writing(&e->firstRecord) = loc; - *opCtx->recoveryUnit()->writing(&e->lastRecord) = loc; - r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs; - } else { - MmapV1RecordHeader* oldlast = recordFor(e->lastRecord); - r->prevOfs() = e->lastRecord.getOfs(); - r->nextOfs() = DiskLoc::NullOfs; - opCtx->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs(); - *opCtx->recoveryUnit()->writing(&e->lastRecord) = loc; - } -} - -void RecordStoreV1Base::increaseStorageSize(OperationContext* opCtx, int size, bool enforceQuota) { - DiskLoc eloc = _extentManager->allocateExtent(opCtx, isCapped(), size, enforceQuota); - Extent* e = _extentManager->getExtent(eloc); - invariant(e); - - *opCtx->recoveryUnit()->writing(&e->nsDiagnostic) = _ns; - - opCtx->recoveryUnit()->writing(&e->xnext)->Null(); - opCtx->recoveryUnit()->writing(&e->xprev)->Null(); - opCtx->recoveryUnit()->writing(&e->firstRecord)->Null(); - opCtx->recoveryUnit()->writing(&e->lastRecord)->Null(); - - DiskLoc emptyLoc = _findFirstSpot(opCtx, eloc, e); - - if (_details->lastExtent(opCtx).isNull()) { - invariant(_details->firstExtent(opCtx).isNull()); - _details->setFirstExtent(opCtx, eloc); - _details->setLastExtent(opCtx, eloc); - _details->setCapExtent(opCtx, eloc); - invariant(e->xprev.isNull()); - invariant(e->xnext.isNull()); - } else { - invariant(!_details->firstExtent(opCtx).isNull()); - *opCtx->recoveryUnit()->writing(&e->xprev) = _details->lastExtent(opCtx); - *opCtx->recoveryUnit()->writing( - &_extentManager->getExtent(_details->lastExtent(opCtx))->xnext) = eloc; - _details->setLastExtent(opCtx, eloc); - } - - _details->setLastExtentSize(opCtx, e->length); - - addDeletedRec(opCtx, emptyLoc); -} - -Status RecordStoreV1Base::validate(OperationContext* opCtx, - ValidateCmdLevel level, - ValidateAdaptor* adaptor, - ValidateResults* results, - BSONObjBuilder* output) { - // 1) basic status that require no iteration - // 2) extent level info - // 3) check extent start and end - // 4) check each non-deleted record - // 5) check deleted list - - // ------------- - - // 1111111111111111111 - if (isCapped()) { - output->appendBool("capped", true); - output->appendNumber("max", _details->maxCappedDocs()); - } - - output->appendNumber("datasize", _details->dataSize()); - output->appendNumber("nrecords", _details->numRecords()); - output->appendNumber("lastExtentSize", _details->lastExtentSize(opCtx)); - - if (_details->firstExtent(opCtx).isNull()) - output->append("firstExtent", "null"); - else - output->append("firstExtent", - str::stream() << _details->firstExtent(opCtx).toString() << " ns:" - << _getExtent(opCtx, _details->firstExtent(opCtx)) - ->nsDiagnostic.toString()); - if (_details->lastExtent(opCtx).isNull()) - 
output->append("lastExtent", "null"); - else - output->append("lastExtent", - str::stream() << _details->lastExtent(opCtx).toString() << " ns:" - << _getExtent(opCtx, _details->lastExtent(opCtx)) - ->nsDiagnostic.toString()); - - // 22222222222222222222222222 - { // validate extent basics - BSONArrayBuilder extentData; - int extentCount = 0; - DiskLoc extentDiskLoc; - try { - if (!_details->firstExtent(opCtx).isNull()) { - _getExtent(opCtx, _details->firstExtent(opCtx))->assertOk(); - _getExtent(opCtx, _details->lastExtent(opCtx))->assertOk(); - } - - extentDiskLoc = _details->firstExtent(opCtx); - while (!extentDiskLoc.isNull()) { - Extent* thisExtent = _getExtent(opCtx, extentDiskLoc); - if (level == kValidateFull) { - extentData << thisExtent->dump(); - } - if (!thisExtent->validates(extentDiskLoc, &results->errors)) { - results->valid = false; - } - DiskLoc nextDiskLoc = thisExtent->xnext; - - if (extentCount > 0 && !nextDiskLoc.isNull() && - _getExtent(opCtx, nextDiskLoc)->xprev != extentDiskLoc) { - StringBuilder sb; - sb << "'xprev' pointer " << _getExtent(opCtx, nextDiskLoc)->xprev.toString() - << " in extent " << nextDiskLoc.toString() << " does not point to extent " - << extentDiskLoc.toString(); - results->errors.push_back(sb.str()); - results->valid = false; - } - if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(opCtx)) { - StringBuilder sb; - sb << "'lastExtent' pointer " << _details->lastExtent(opCtx).toString() - << " does not point to last extent in list " << extentDiskLoc.toString(); - results->errors.push_back(sb.str()); - results->valid = false; - } - extentDiskLoc = nextDiskLoc; - extentCount++; - opCtx->checkForInterrupt(); - } - } catch (const DBException& e) { - StringBuilder sb; - sb << "exception validating extent " << extentCount << ": " << e.what(); - results->errors.push_back(sb.str()); - results->valid = false; - return Status::OK(); - } - output->append("extentCount", extentCount); - - if (level == kValidateFull) - output->appendArray("extents", extentData.arr()); - } - - try { - // 333333333333333333333333333 - bool testingLastExtent = false; - try { - DiskLoc firstExtentLoc = _details->firstExtent(opCtx); - if (firstExtentLoc.isNull()) { - // this is ok - } else { - output->append("firstExtentDetails", _getExtent(opCtx, firstExtentLoc)->dump()); - if (!_getExtent(opCtx, firstExtentLoc)->xprev.isNull()) { - StringBuilder sb; - sb << "'xprev' pointer in 'firstExtent' " - << _details->firstExtent(opCtx).toString() << " is " - << _getExtent(opCtx, firstExtentLoc)->xprev.toString() << ", should be null"; - results->errors.push_back(sb.str()); - results->valid = false; - } - } - testingLastExtent = true; - DiskLoc lastExtentLoc = _details->lastExtent(opCtx); - if (lastExtentLoc.isNull()) { - // this is ok - } else { - if (firstExtentLoc != lastExtentLoc) { - output->append("lastExtentDetails", _getExtent(opCtx, lastExtentLoc)->dump()); - if (!_getExtent(opCtx, lastExtentLoc)->xnext.isNull()) { - StringBuilder sb; - sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString() - << " is " << _getExtent(opCtx, lastExtentLoc)->xnext.toString() - << ", should be null"; - results->errors.push_back(sb.str()); - results->valid = false; - } - } - } - } catch (const DBException& e) { - StringBuilder sb; - sb << "exception processing '" << (testingLastExtent ? 
"lastExtent" : "firstExtent") - << "': " << e.what(); - results->errors.push_back(sb.str()); - results->valid = false; - } - - // 4444444444444444444444444 - - set<DiskLoc> recs; - int n = 0; - int nInvalid = 0; - long long nQuantizedSize = 0; - long long len = 0; - long long nlen = 0; - long long bsonLen = 0; - int outOfOrder = 0; - DiskLoc dl_last; - - auto cursor = getCursor(opCtx); - while (auto record = cursor->next()) { - const auto dl = DiskLoc::fromRecordId(record->id); - n++; - - if (n < 1000000 && level == kValidateFull) - recs.insert(dl); - if (isCapped()) { - if (dl < dl_last) - outOfOrder++; - dl_last = dl; - } - - MmapV1RecordHeader* r = recordFor(dl); - len += r->lengthWithHeaders(); - nlen += r->netLength(); - - if (isQuantized(r->lengthWithHeaders())) { - // Count the number of records having a size consistent with - // the quantizeAllocationSpace quantization implementation. - ++nQuantizedSize; - } - - size_t dataSize = 0; - const Status status = adaptor->validate(record->id, r->toRecordData(), &dataSize); - if (!status.isOK()) { - results->valid = false; - if (nInvalid == 0) // only log once; - results->errors.push_back("invalid object detected (see logs)"); - - nInvalid++; - log() << "Invalid object detected in " << _ns << ": " << redact(status); - } else { - bsonLen += dataSize; - } - } - - if (isCapped() && !_details->capLooped()) { - output->append("cappedOutOfOrder", outOfOrder); - if (outOfOrder > 1) { - results->valid = false; - results->errors.push_back("too many out of order records"); - } - } - output->append("objectsFound", n); - output->append("invalidObjects", nInvalid); - output->appendNumber("nQuantizedSize", nQuantizedSize); - output->appendNumber("bytesWithHeaders", len); - output->appendNumber("bytesWithoutHeaders", nlen); - - if (level == kValidateFull) { - output->appendNumber("bytesBson", bsonLen); - } // end scanData - - // 55555555555555555555555555 - - if (level == kValidateFull) { - BSONArrayBuilder deletedListArray; - for (int i = 0; i < Buckets; i++) { - deletedListArray << _details->deletedListEntry(i).isNull(); - } - - int ndel = 0; - long long delSize = 0; - BSONArrayBuilder delBucketSizes; - int incorrect = 0; - for (int i = 0; i < Buckets; i++) { - DiskLoc loc = _details->deletedListEntry(i); - try { - int k = 0; - while (!loc.isNull()) { - if (recs.count(loc)) - incorrect++; - ndel++; - - if (loc.questionable()) { - if (isCapped() && !loc.isValid() && i == 1) { - /* the constructor for NamespaceDetails intentionally sets - * deletedList[1] to invalid see comments in namespace.h - */ - break; - } - - string err(str::stream() << "bad pointer in deleted record list: " - << loc.toString() - << " bucket: " - << i - << " k: " - << k); - results->errors.push_back(err); - results->valid = false; - break; - } - - const DeletedRecord* d = deletedRecordFor(loc); - delSize += d->lengthWithHeaders(); - loc = d->nextDeleted(); - k++; - opCtx->checkForInterrupt(); - } - delBucketSizes << k; - } catch (...) 
{ - results->errors.push_back((string) "exception in deleted chain for bucket " + - BSONObjBuilder::numStr(i)); - results->valid = false; - } - } - - output->appendNumber("deletedCount", ndel); - output->appendNumber("deletedSize", delSize); - output->append("delBucketSizes", delBucketSizes.arr()); - - if (incorrect) { - results->errors.push_back(BSONObjBuilder::numStr(incorrect) + - " records from datafile are in deleted list"); - results->valid = false; - } - } - - } catch (const AssertionException& e) { - StringBuilder sb; - sb << "exception during validate: " << e.what(); - results->errors.push_back(sb.str()); - results->valid = false; - } - - return Status::OK(); -} - -void RecordStoreV1Base::appendCustomStats(OperationContext* opCtx, - BSONObjBuilder* result, - double scale) const { - result->append("lastExtentSize", _details->lastExtentSize(opCtx) / scale); - result->append("paddingFactor", 1.0); // hard coded - result->append("paddingFactorNote", - "paddingFactor is unused and unmaintained in 3.0. It " - "remains hard coded to 1.0 for compatibility only."); - result->append("userFlags", _details->userFlags()); - result->appendBool("capped", isCapped()); - if (isCapped()) { - result->appendNumber("max", _details->maxCappedDocs()); - result->appendNumber("maxSize", - static_cast<long long>(storageSize(opCtx, NULL, 0) / scale)); - } -} - - -namespace { -struct touch_location { - const char* root; - size_t length; -}; -} - -Status RecordStoreV1Base::touch(OperationContext* opCtx, BSONObjBuilder* output) const { - Timer t; - - std::vector<touch_location> ranges; - { - DiskLoc nextLoc = _details->firstExtent(opCtx); - Extent* ext = nextLoc.isNull() ? NULL : _getExtent(opCtx, nextLoc); - while (ext) { - touch_location tl; - tl.root = reinterpret_cast<const char*>(ext); - tl.length = ext->length; - ranges.push_back(tl); - - nextLoc = ext->xnext; - if (nextLoc.isNull()) - ext = NULL; - else - ext = _getExtent(opCtx, nextLoc); - } - } - - std::string progress_msg = "touch " + ns() + " extents"; - stdx::unique_lock<Client> lk(*opCtx->getClient()); - ProgressMeterHolder pm(CurOp::get(opCtx)->setMessage_inlock( - progress_msg.c_str(), "Touch Progress", ranges.size())); - lk.unlock(); - - for (std::vector<touch_location>::iterator it = ranges.begin(); it != ranges.end(); ++it) { - touch_pages(it->root, it->length); - pm.hit(); - opCtx->checkForInterrupt(); - } - pm.finished(); - - if (output) { - output->append("numRanges", static_cast<int>(ranges.size())); - output->append("millis", t.millis()); - } - - return Status::OK(); -} - -boost::optional<Record> RecordStoreV1Base::IntraExtentIterator::next() { - if (_curr.isNull()) - return {}; - auto out = _curr.toRecordId(); - advance(); - return {{out, _rs->dataFor(_opCtx, out)}}; -} - -void RecordStoreV1Base::IntraExtentIterator::advance() { - if (_curr.isNull()) - return; - - const MmapV1RecordHeader* rec = recordFor(_curr); - const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs(); - _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs)); -} - -void RecordStoreV1Base::IntraExtentIterator::invalidate(OperationContext* opCtx, - const RecordId& rid) { - if (rid == _curr.toRecordId()) { - const DiskLoc origLoc = _curr; - - // Undo the advance on rollback, as the deletion that forced it "never happened". 
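The invalidate() here saves the pre-advance position and registers an onRollback handler, so that if the deletion that forced the advance aborts, the cursor snaps back to where it was. A toy version of that pattern; FakeRecoveryUnit is a stand-in, not the real RecoveryUnit interface:

#include <functional>
#include <vector>

// Minimal stand-in for the rollback half of RecoveryUnit: handlers run in
// reverse registration order when the unit of work aborts.
struct FakeRecoveryUnit {
    std::vector<std::function<void()>> handlers;
    void onRollback(std::function<void()> f) {
        handlers.push_back(std::move(f));
    }
    void abort() {
        for (auto it = handlers.rbegin(); it != handlers.rend(); ++it)
            (*it)();
    }
};

int main() {
    int curr = 42;  // stands in for the iterator's _curr position
    FakeRecoveryUnit ru;
    const int orig = curr;
    ru.onRollback([&curr, orig] { curr = orig; });  // capture before advancing
    curr = 43;   // the advance forced by the deletion
    ru.abort();  // deletion rolled back: the cursor is back at 42
    return curr == 42 ? 0 : 1;
}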
- opCtx->recoveryUnit()->onRollback([this, origLoc]() { this->_curr = origLoc; }); - advance(); - } -} - -std::unique_ptr<RecordFetcher> RecordStoreV1Base::IntraExtentIterator::fetcherForNext() const { - return _rs->_extentManager->recordNeedsFetch(_curr); -} - -int RecordStoreV1Base::quantizeAllocationSpace(int allocSize) { - invariant(allocSize <= MaxAllowedAllocation); - for (int i = 0; i < Buckets - 2; i++) { // last two bucketSizes are invalid - if (bucketSizes[i] >= allocSize) { - // Return the size of the first bucket sized >= the requested size. - return bucketSizes[i]; - } - } - MONGO_UNREACHABLE; // prior invariant means we should find something. -} - -bool RecordStoreV1Base::isQuantized(int recordSize) { - if (recordSize > MaxAllowedAllocation) - return false; - - return recordSize == quantizeAllocationSpace(recordSize); -} - -int RecordStoreV1Base::bucket(int size) { - for (int i = 0; i < Buckets; i++) { - if (bucketSizes[i] > size) { - // Return the first bucket sized _larger_ than the requested size. This is important - // since we want all records in a bucket to be >= the quantized size, therefore the - // quantized size must be the smallest allowed record per bucket. - return i; - } - } - // Technically, this is reachable if size == INT_MAX, but it would be an error to pass that - // in anyway since it would be impossible to have a record that large given the file and - // extent headers. - MONGO_UNREACHABLE; -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h b/src/mongo/db/storage/mmap_v1/record_store_v1_base.h deleted file mode 100644 index 7e21228fbb9..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_base.h +++ /dev/null @@ -1,364 +0,0 @@ -/** -* Copyright (C) 2013-2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#pragma once - -#include "mongo/stdx/unordered_set.h" -#include "mongo/util/concurrency/spin_lock.h" - -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/record_store.h" - -namespace mongo { - -class DeletedRecord; -class ExtentManager; -class MmapV1RecordHeader; -class OperationContext; - -struct Extent; - -class RecordStoreV1MetaData { -public: - virtual ~RecordStoreV1MetaData() {} - - virtual const DiskLoc& capExtent() const = 0; - virtual void setCapExtent(OperationContext* opCtx, const DiskLoc& loc) = 0; - - virtual const DiskLoc& capFirstNewRecord() const = 0; - virtual void setCapFirstNewRecord(OperationContext* opCtx, const DiskLoc& loc) = 0; - - bool capLooped() const { - return capFirstNewRecord().isValid(); - } - - virtual long long dataSize() const = 0; - virtual long long numRecords() const = 0; - - virtual void incrementStats(OperationContext* opCtx, - long long dataSizeIncrement, - long long numRecordsIncrement) = 0; - - virtual void setStats(OperationContext* opCtx, long long dataSize, long long numRecords) = 0; - - virtual DiskLoc deletedListEntry(int bucket) const = 0; - virtual void setDeletedListEntry(OperationContext* opCtx, int bucket, const DiskLoc& loc) = 0; - - virtual DiskLoc deletedListLegacyGrabBag() const = 0; - virtual void setDeletedListLegacyGrabBag(OperationContext* opCtx, const DiskLoc& loc) = 0; - - virtual void orphanDeletedList(OperationContext* opCtx) = 0; - - virtual const DiskLoc& firstExtent(OperationContext* opCtx) const = 0; - virtual void setFirstExtent(OperationContext* opCtx, const DiskLoc& loc) = 0; - - virtual const DiskLoc& lastExtent(OperationContext* opCtx) const = 0; - virtual void setLastExtent(OperationContext* opCtx, const DiskLoc& loc) = 0; - - virtual bool isCapped() const = 0; - - virtual bool isUserFlagSet(int flag) const = 0; - virtual int userFlags() const = 0; - virtual bool setUserFlag(OperationContext* opCtx, int flag) = 0; - virtual bool clearUserFlag(OperationContext* opCtx, int flag) = 0; - virtual bool replaceUserFlags(OperationContext* opCtx, int flags) = 0; - - virtual int lastExtentSize(OperationContext* opCtx) const = 0; - virtual void setLastExtentSize(OperationContext* opCtx, int newMax) = 0; - - virtual long long maxCappedDocs() const = 0; -}; - -/** - * Class that stores active cursors that have been saved (as part of yielding) to - * allow them to be invalidated if the thing they pointed at goes away. The registry is - * thread-safe, as readers may concurrently register and remove their cursors. Contention is - * expected to be very low, as yielding is infrequent. This logically belongs to the - * RecordStore, but is not contained in it to facilitate unit testing. - */ -class SavedCursorRegistry { -public: - /** - * The destructor ensures the cursor is unregistered when an exception is thrown. - * Note that the SavedCursor may outlive the registry it was saved in. - */ - struct SavedCursor { - SavedCursor() : _registry(NULL) {} - virtual ~SavedCursor() { - if (_registry) - _registry->unregisterCursor(this); - } - DiskLoc bucket; - BSONObj key; - DiskLoc loc; - - private: - friend class SavedCursorRegistry; - // Non-null iff registered. Accessed by owner or writer with MODE_X collection lock - SavedCursorRegistry* _registry; - }; - - ~SavedCursorRegistry(); - - /** - * Adds given saved cursor to SavedCursorRegistry. Doesn't take ownership. - */ - void registerCursor(SavedCursor* cursor); - - /** - * Removes given saved cursor. 
Returns true if the cursor was still present, and false - * if it had already been removed due to invalidation. Doesn't take ownership. - */ - bool unregisterCursor(SavedCursor* cursor); - - /** - * When a btree-bucket disappears due to merge/split or similar, this invalidates all - * cursors that point at the same bucket by removing them from the registry. - */ - void invalidateCursorsForBucket(DiskLoc bucket); - -private: - SpinLock _mutex; - typedef stdx::unordered_set<SavedCursor*> - SavedCursorSet; // SavedCursor pointers not owned here - SavedCursorSet _cursors; -}; - -class RecordStoreV1Base : public RecordStore { -public: - static const int Buckets = 26; - static const int MaxAllowedAllocation = 16 * 1024 * 1024 + 512 * 1024; - - static const int bucketSizes[]; - - // ------------ - - class IntraExtentIterator; - - /** - * @param details - takes ownership - * @param em - does NOT take ownership - */ - RecordStoreV1Base(StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes); - - virtual ~RecordStoreV1Base(); - - const std::string& getIdent() const override { - MONGO_UNREACHABLE; - } - - virtual long long dataSize(OperationContext* opCtx) const { - return _details->dataSize(); - } - virtual long long numRecords(OperationContext* opCtx) const { - return _details->numRecords(); - } - - virtual int64_t storageSize(OperationContext* opCtx, - BSONObjBuilder* extraInfo = NULL, - int level = 0) const; - - virtual RecordData dataFor(OperationContext* opCtx, const RecordId& loc) const; - - virtual bool findRecord(OperationContext* opCtx, const RecordId& loc, RecordData* rd) const; - - void deleteRecord(OperationContext* opCtx, const RecordId& dl); - - StatusWith<RecordId> insertRecord( - OperationContext* opCtx, const char* data, int len, Timestamp, bool enforceQuota); - - Status insertRecordsWithDocWriter(OperationContext* opCtx, - const DocWriter* const* docs, - const Timestamp*, - size_t nDocs, - RecordId* idsOut) final; - - virtual Status updateRecord(OperationContext* opCtx, - const RecordId& oldLocation, - const char* data, - int len, - bool enforceQuota, - UpdateNotifier* notifier); - - virtual bool updateWithDamagesSupported() const; - - virtual StatusWith<RecordData> updateWithDamages(OperationContext* opCtx, - const RecordId& loc, - const RecordData& oldRec, - const char* damageSource, - const mutablebson::DamageVector& damages); - - virtual std::unique_ptr<RecordCursor> getCursorForRepair(OperationContext* opCtx) const; - - void increaseStorageSize(OperationContext* opCtx, int size, bool enforceQuota); - - virtual Status validate(OperationContext* opCtx, - ValidateCmdLevel level, - ValidateAdaptor* adaptor, - ValidateResults* results, - BSONObjBuilder* output); - - virtual void appendCustomStats(OperationContext* opCtx, - BSONObjBuilder* result, - double scale) const; - - virtual Status touch(OperationContext* opCtx, BSONObjBuilder* output) const; - - const RecordStoreV1MetaData* details() const { - return _details.get(); - } - - // This keeps track of cursors saved during yielding, for invalidation purposes. 
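Stripped of MongoDB types, the SavedCursorRegistry declared above is a lock-guarded set of non-owned cursor pointers, each carrying a back-pointer so that unregistering after invalidation degrades to a no-op. A condensed sketch, substituting std::mutex for SpinLock and an int key for the DiskLoc bucket:

#include <mutex>
#include <unordered_set>

struct Registry;

struct Cursor {
    Registry* registry = nullptr;  // non-null iff currently registered
    int bucket = 0;                // stand-in for the DiskLoc bucket key
};

struct Registry {
    std::mutex mu;
    std::unordered_set<Cursor*> cursors;  // pointers are not owned here

    void registerCursor(Cursor* c) {
        c->registry = this;
        std::lock_guard<std::mutex> lk(mu);
        cursors.insert(c);
    }

    // Returns false if the cursor was already dropped by invalidation.
    bool unregisterCursor(Cursor* c) {
        if (!c->registry)
            return false;
        c->registry = nullptr;
        std::lock_guard<std::mutex> lk(mu);
        return cursors.erase(c) > 0;
    }

    // Drop every cursor parked on 'bucket', clearing back-pointers so their
    // destructors will not try to unregister again.
    void invalidateBucket(int bucket) {
        std::lock_guard<std::mutex> lk(mu);
        for (auto it = cursors.begin(); it != cursors.end();) {
            if ((*it)->bucket == bucket) {
                (*it)->registry = nullptr;
                it = cursors.erase(it);
            } else {
                ++it;
            }
        }
    }
};

int main() {
    Registry reg;
    Cursor c{nullptr, 7};
    reg.registerCursor(&c);
    reg.invalidateBucket(7);          // c is dropped from the registry
    return reg.unregisterCursor(&c);  // 0: already invalidated, a no-op
}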
- SavedCursorRegistry savedCursors; - - DiskLoc getExtentLocForRecord(OperationContext* opCtx, const DiskLoc& loc) const; - - DiskLoc getNextRecord(OperationContext* opCtx, const DiskLoc& loc) const; - DiskLoc getPrevRecord(OperationContext* opCtx, const DiskLoc& loc) const; - - DiskLoc getNextRecordInExtent(OperationContext* opCtx, const DiskLoc& loc) const; - DiskLoc getPrevRecordInExtent(OperationContext* opCtx, const DiskLoc& loc) const; - - /** - * Quantize 'minSize' to the nearest allocation size. - */ - static int quantizeAllocationSpace(int minSize); - - static bool isQuantized(int recordSize); - - /* return which "deleted bucket" for this size object */ - static int bucket(int size); - - void waitForAllEarlierOplogWritesToBeVisible(OperationContext* opCtx) const override {} - - virtual void updateStatsAfterRepair(OperationContext* opCtx, - long long numRecords, - long long dataSize) { - MONGO_UNREACHABLE; // MMAPv1 has its own repair which doesn't call this. - } - -protected: - virtual MmapV1RecordHeader* recordFor(const DiskLoc& loc) const; - - const DeletedRecord* deletedRecordFor(const DiskLoc& loc) const; - - virtual bool isCapped() const = 0; - - virtual bool shouldPadInserts() const = 0; - - virtual StatusWith<DiskLoc> allocRecord(OperationContext* opCtx, - int lengthWithHeaders, - bool enforceQuota) = 0; - - // TODO: document, remove, what have you - virtual void addDeletedRec(OperationContext* opCtx, const DiskLoc& dloc) = 0; - - // TODO: another sad one - virtual DeletedRecord* drec(const DiskLoc& loc) const; - - // just a wrapper for _extentManager->getExtent( loc ); - Extent* _getExtent(OperationContext* opCtx, const DiskLoc& loc) const; - - DiskLoc _getExtentLocForRecord(OperationContext* opCtx, const DiskLoc& loc) const; - - DiskLoc _getNextRecord(OperationContext* opCtx, const DiskLoc& loc) const; - DiskLoc _getPrevRecord(OperationContext* opCtx, const DiskLoc& loc) const; - - DiskLoc _getNextRecordInExtent(OperationContext* opCtx, const DiskLoc& loc) const; - DiskLoc _getPrevRecordInExtent(OperationContext* opCtx, const DiskLoc& loc) const; - - /** - * finds the first suitable DiskLoc for data - * will return the DiskLoc of a newly created DeletedRecord - */ - DiskLoc _findFirstSpot(OperationContext* opCtx, const DiskLoc& extDiskLoc, Extent* e); - - /** add a record to the end of the linked list chain within this extent. - require: you must have already declared write intent for the record header. - */ - void _addRecordToRecListInExtent(OperationContext* opCtx, MmapV1RecordHeader* r, DiskLoc loc); - - /** - * internal - * doesn't check inputs or change padding - */ - StatusWith<RecordId> _insertRecord(OperationContext* opCtx, - const char* data, - int len, - bool enforceQuota); - - std::unique_ptr<RecordStoreV1MetaData> _details; - ExtentManager* _extentManager; - bool _isSystemIndexes; - - friend class RecordStoreV1RepairCursor; -}; - -/** - * Iterates over all records within a single extent. - * - * EOF at end of extent, even if there are more extents. 
- */ -class RecordStoreV1Base::IntraExtentIterator final : public RecordCursor { -public: - IntraExtentIterator(OperationContext* opCtx, - DiskLoc start, - const RecordStoreV1Base* rs, - bool forward = true) - : _opCtx(opCtx), _curr(start), _rs(rs), _forward(forward) {} - - boost::optional<Record> next() final; - void invalidate(OperationContext* opCtx, const RecordId& dl) final; - void save() final {} - bool restore() final { - return true; - } - void detachFromOperationContext() final { - _opCtx = nullptr; - } - void reattachToOperationContext(OperationContext* opCtx) final { - _opCtx = opCtx; - } - std::unique_ptr<RecordFetcher> fetcherForNext() const final; - -private: - virtual const MmapV1RecordHeader* recordFor(const DiskLoc& loc) const { - return _rs->recordFor(loc); - } - - void advance(); - - OperationContext* _opCtx; - DiskLoc _curr; - const RecordStoreV1Base* _rs; - bool _forward; -}; -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp deleted file mode 100644 index 6a3c02b562f..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.cpp +++ /dev/null @@ -1,696 +0,0 @@ -// record_store_v1_capped.cpp - -/** - * Copyright (C) 2013 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" - -#include "mongo/db/client.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" - -/* - capped collection layout - - d's below won't exist if things align perfectly: - - extent1 -> extent2 -> extent3 - ------------------- ----------------------- --------------------- - d r r r r r r r r d d r r r r d r r r r r d d r r r r r r r r r d - ^ ^ - oldest newest - - ^cappedFirstDeletedInCurExtent() - ^cappedLastDelRecLastExtent() - ^cappedListOfAllDeletedRecords() -*/ - -#define DDD(x) - -namespace mongo { - -using std::dec; -using std::endl; -using std::hex; -using std::vector; - -CappedRecordStoreV1::CappedRecordStoreV1(OperationContext* opCtx, - CappedCallback* collection, - StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes) - : RecordStoreV1Base(ns, details, em, isSystemIndexes), _cappedCallback(collection) { - DiskLoc extentLoc = details->firstExtent(opCtx); - while (!extentLoc.isNull()) { - _extentAdvice.push_back(_extentManager->cacheHint(extentLoc, ExtentManager::Sequential)); - Extent* extent = em->getExtent(extentLoc); - extentLoc = extent->xnext; - } - - // this is for VERY VERY old versions of capped collections - cappedCheckMigrate(opCtx); -} - -CappedRecordStoreV1::~CappedRecordStoreV1() {} - -StatusWith<DiskLoc> CappedRecordStoreV1::allocRecord(OperationContext* opCtx, - int lenToAlloc, - bool enforceQuota) { - { - // align very slightly. - lenToAlloc = (lenToAlloc + 3) & 0xfffffffc; - } - - if (lenToAlloc > theCapExtent()->length) { - // the extent check is a way to try and improve performance - // since we have to iterate all the extents (for now) to get - // storage size - if (lenToAlloc > storageSize(opCtx)) { - return StatusWith<DiskLoc>( - ErrorCodes::DocTooLargeForCapped, - mongoutils::str::stream() << "document is larger than capped size " << lenToAlloc - << " > " - << storageSize(opCtx)); - } - } - DiskLoc loc; - { // do allocation - - // signal done allocating new extents. - if (!cappedLastDelRecLastExtent().isValid()) - setLastDelRecLastExtent(opCtx, DiskLoc()); - - invariant(lenToAlloc < 400000000); - int passes = 0; - - // delete records until we have room and the max # objects limit achieved. - - /* this fails on a rename -- that is ok but must keep commented out */ - // invariant( theCapExtent()->ns == ns ); - - theCapExtent()->assertOk(); - DiskLoc firstEmptyExtent; // This prevents us from infinite looping. - while (1) { - if (_details->numRecords() < _details->maxCappedDocs()) { - loc = __capAlloc(opCtx, lenToAlloc); - if (!loc.isNull()) - break; - } - - // If on first iteration through extents, don't delete anything. - if (!_details->capFirstNewRecord().isValid()) { - advanceCapExtent(opCtx, _ns); - - if (_details->capExtent() != _details->firstExtent(opCtx)) - _details->setCapFirstNewRecord(opCtx, DiskLoc().setInvalid()); - // else signal done with first iteration through extents. 
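This allocation loop is ring-buffer eviction at heart: keep deleting the oldest record until the new one fits, and fail only once every record is gone and there is still no room. Its skeleton on a byte-budgeted std::deque, eliding extents, the max-document count, and the first-pass bookkeeping:

#include <deque>

// Capped insert in miniature: evict the oldest records until the new one
// fits, as CappedRecordStoreV1::allocRecord does by repeatedly deleting
// theCapExtent()->firstRecord.
struct MiniCapped {
    std::deque<int> recs;  // record lengths, oldest first
    int capacity;
    int used = 0;

    explicit MiniCapped(int cap) : capacity(cap) {}

    bool insert(int len) {
        len = (len + 3) & ~3;            // same 4-byte round-up as allocRecord
        if (len > capacity)
            return false;                // DocTooLargeForCapped
        while (used + len > capacity) {  // make room from the oldest end
            used -= recs.front();
            recs.pop_front();
        }
        recs.push_back(len);
        used += len;
        return true;
    }
};

int main() {
    MiniCapped store(100);
    for (int i = 0; i < 10; ++i)
        store.insert(30);  // rounds to 32; older records are evicted as needed
    return store.recs.size() == 3 ? 0 : 1;  // 3 * 32 <= 100 < 4 * 32
}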
- continue; - } - - if (!_details->capFirstNewRecord().isNull() && - theCapExtent()->firstRecord == _details->capFirstNewRecord()) { - // We've deleted all records that were allocated on the previous - // iteration through this extent. - advanceCapExtent(opCtx, _ns); - continue; - } - - if (theCapExtent()->firstRecord.isNull()) { - if (firstEmptyExtent.isNull()) - firstEmptyExtent = _details->capExtent(); - advanceCapExtent(opCtx, _ns); - if (firstEmptyExtent == _details->capExtent()) { - // All records have been deleted but there is still no room for this record. - // Nothing we can do but fail. - _maybeComplain(opCtx, lenToAlloc); - return StatusWith<DiskLoc>(ErrorCodes::DocTooLargeForCapped, - str::stream() - << "document doesn't fit in capped collection." - << " size: " - << lenToAlloc - << " storageSize:" - << storageSize(opCtx)); - } - continue; - } - - const RecordId fr = theCapExtent()->firstRecord.toRecordId(); - Status status = _cappedCallback->aboutToDeleteCapped(opCtx, fr, dataFor(opCtx, fr)); - if (!status.isOK()) - return StatusWith<DiskLoc>(status); - deleteRecord(opCtx, fr); - - _compact(opCtx); - if ((++passes % 5000) == 0) { - StringBuilder sb; - log() << "passes = " << passes << " in CappedRecordStoreV1::allocRecord:" - << " ns: " << _ns << ", lenToAlloc: " << lenToAlloc - << ", maxCappedDocs: " << _details->maxCappedDocs() - << ", nrecords: " << _details->numRecords() - << ", datasize: " << _details->dataSize() - << ". Continuing to delete old records to make room."; - } - } - - // Remember first record allocated on this iteration through capExtent. - if (_details->capFirstNewRecord().isValid() && _details->capFirstNewRecord().isNull()) - _details->setCapFirstNewRecord(opCtx, loc); - } - - invariant(!loc.isNull()); - - // possibly slice up if we've allocated too much space - - DeletedRecord* r = drec(loc); - - /* note we want to grab from the front so our next pointers on disk tend - to go in a forward direction which is important for performance. */ - int regionlen = r->lengthWithHeaders(); - invariant(r->extentOfs() < loc.getOfs()); - - int left = regionlen - lenToAlloc; - - /* split off some for further use. */ - opCtx->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc; - DiskLoc newDelLoc = loc; - newDelLoc.inc(lenToAlloc); - DeletedRecord* newDel = drec(newDelLoc); - DeletedRecord* newDelW = opCtx->recoveryUnit()->writing(newDel); - newDelW->extentOfs() = r->extentOfs(); - newDelW->lengthWithHeaders() = left; - newDelW->nextDeleted().Null(); - - addDeletedRec(opCtx, newDelLoc); - - return StatusWith<DiskLoc>(loc); -} - -Status CappedRecordStoreV1::truncate(OperationContext* opCtx) { - setLastDelRecLastExtent(opCtx, DiskLoc()); - setListOfAllDeletedRecords(opCtx, DiskLoc()); - - // preserve firstExtent/lastExtent - _details->setCapExtent(opCtx, _details->firstExtent(opCtx)); - _details->setStats(opCtx, 0, 0); - // preserve lastExtentSize - // nIndexes preserve 0 - // capped preserve true - // max preserve - // paddingFactor is unused - _details->setCapFirstNewRecord(opCtx, DiskLoc().setInvalid()); - setLastDelRecLastExtent(opCtx, DiskLoc().setInvalid()); - // dataFileVersion preserve - // indexFileVersion preserve - - // Reset all existing extents and recreate the deleted list. 
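_compact() just below pulls the current extent's deleted records off the free list, sorts them by disk location, and merges byte-adjacent runs back into single deleted records. The merge step, sketched over plain (offset, length) pairs with illustrative names:

#include <algorithm>
#include <vector>

struct Dead {
    int ofs;  // start of a deleted region within one extent
    int len;  // its lengthWithHeaders
};

// Merge byte-adjacent deleted regions, as _compact() does after sorting the
// current extent's deleted records by DiskLoc.
static std::vector<Dead> coalesce(std::vector<Dead> v) {
    std::sort(v.begin(), v.end(), [](const Dead& a, const Dead& b) { return a.ofs < b.ofs; });
    std::vector<Dead> out;
    for (const Dead& d : v) {
        if (!out.empty() && out.back().ofs + out.back().len == d.ofs)
            out.back().len += d.len;  // a & b are adjacent: merge
        else
            out.push_back(d);
    }
    return out;
}

int main() {
    // The first two regions touch and merge; the third stands alone.
    std::vector<Dead> v = {{0, 16}, {16, 32}, {64, 16}};
    return coalesce(v).size() == 2 ? 0 : 1;
}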
- Extent* ext; - for (DiskLoc extLoc = _details->firstExtent(opCtx); !extLoc.isNull(); extLoc = ext->xnext) { - ext = _extentManager->getExtent(extLoc); - - opCtx->recoveryUnit()->writing(&ext->firstRecord)->Null(); - opCtx->recoveryUnit()->writing(&ext->lastRecord)->Null(); - - addDeletedRec(opCtx, _findFirstSpot(opCtx, extLoc, ext)); - } - - return Status::OK(); -} - -void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* opCtx, - RecordId end, - bool inclusive) { - cappedTruncateAfter(opCtx, _ns.c_str(), DiskLoc::fromRecordId(end), inclusive); -} - -/* combine adjacent deleted records *for the current extent* of the capped collection - - this is O(n^2) but we call it for capped tables where typically n==1 or 2! - (or 3...there will be a little unused sliver at the end of the extent.) -*/ -void CappedRecordStoreV1::_compact(OperationContext* opCtx) { - DDD("CappedRecordStoreV1::compact enter"); - - vector<DiskLoc> drecs; - - // Pull out capExtent's DRs from deletedList - DiskLoc i = cappedFirstDeletedInCurExtent(); - for (; !i.isNull() && inCapExtent(i); i = deletedRecordFor(i)->nextDeleted()) { - DDD("\t" << i); - drecs.push_back(i); - } - - setFirstDeletedInCurExtent(opCtx, i); - - std::sort(drecs.begin(), drecs.end()); - DDD("\t drecs.size(): " << drecs.size()); - - vector<DiskLoc>::const_iterator j = drecs.begin(); - invariant(j != drecs.end()); - DiskLoc a = *j; - while (1) { - j++; - if (j == drecs.end()) { - DDD("\t compact adddelrec"); - addDeletedRec(opCtx, a); - break; - } - DiskLoc b = *j; - while (a.a() == b.a() && a.getOfs() + drec(a)->lengthWithHeaders() == b.getOfs()) { - // a & b are adjacent. merge. - opCtx->recoveryUnit()->writingInt(drec(a)->lengthWithHeaders()) += - drec(b)->lengthWithHeaders(); - j++; - if (j == drecs.end()) { - DDD("\t compact adddelrec2"); - addDeletedRec(opCtx, a); - return; - } - b = *j; - } - DDD("\t compact adddelrec3"); - addDeletedRec(opCtx, a); - a = b; - } -} - -DiskLoc CappedRecordStoreV1::cappedFirstDeletedInCurExtent() const { - if (cappedLastDelRecLastExtent().isNull()) - return cappedListOfAllDeletedRecords(); - else - return drec(cappedLastDelRecLastExtent())->nextDeleted(); -} - -void CappedRecordStoreV1::setFirstDeletedInCurExtent(OperationContext* opCtx, const DiskLoc& loc) { - if (cappedLastDelRecLastExtent().isNull()) - setListOfAllDeletedRecords(opCtx, loc); - else - *opCtx->recoveryUnit()->writing(&drec(cappedLastDelRecLastExtent())->nextDeleted()) = loc; -} - -void CappedRecordStoreV1::cappedCheckMigrate(OperationContext* opCtx) { - // migrate old RecordStoreV1MetaData format - if (_details->capExtent().a() == 0 && _details->capExtent().getOfs() == 0) { - WriteUnitOfWork wunit(opCtx); - _details->setCapFirstNewRecord(opCtx, DiskLoc().setInvalid()); - // put all the DeletedRecords in cappedListOfAllDeletedRecords() - for (int i = 1; i < Buckets; ++i) { - DiskLoc first = _details->deletedListEntry(i); - if (first.isNull()) - continue; - DiskLoc last = first; - for (; !drec(last)->nextDeleted().isNull(); last = drec(last)->nextDeleted()) - ; - *opCtx->recoveryUnit()->writing(&drec(last)->nextDeleted()) = - cappedListOfAllDeletedRecords(); - setListOfAllDeletedRecords(opCtx, first); - _details->setDeletedListEntry(opCtx, i, DiskLoc()); - } - // NOTE cappedLastDelRecLastExtent() set to DiskLoc() in above - - // Last, in case we're killed before getting here - _details->setCapExtent(opCtx, _details->firstExtent(opCtx)); - wunit.commit(); - } -} - -bool CappedRecordStoreV1::inCapExtent(const DiskLoc& dl) const { - 
invariant(!dl.isNull()); - - if (dl.a() != _details->capExtent().a()) - return false; - - if (dl.getOfs() < _details->capExtent().getOfs()) - return false; - - const Extent* e = theCapExtent(); - int end = _details->capExtent().getOfs() + e->length; - return dl.getOfs() <= end; -} - -bool CappedRecordStoreV1::nextIsInCapExtent(const DiskLoc& dl) const { - invariant(!dl.isNull()); - DiskLoc next = drec(dl)->nextDeleted(); - if (next.isNull()) - return false; - return inCapExtent(next); -} - -void CappedRecordStoreV1::advanceCapExtent(OperationContext* opCtx, StringData ns) { - // We want cappedLastDelRecLastExtent() to be the last DeletedRecord of the prev cap extent - // (or DiskLoc() if new capExtent == firstExtent) - if (_details->capExtent() == _details->lastExtent(opCtx)) - setLastDelRecLastExtent(opCtx, DiskLoc()); - else { - DiskLoc i = cappedFirstDeletedInCurExtent(); - for (; !i.isNull() && nextIsInCapExtent(i); i = drec(i)->nextDeleted()) - ; - setLastDelRecLastExtent(opCtx, i); - } - - _details->setCapExtent(opCtx, - theCapExtent()->xnext.isNull() ? _details->firstExtent(opCtx) - : theCapExtent()->xnext); - - /* this isn't true if a collection has been renamed...that is ok just used for diagnostics */ - // dassert( theCapExtent()->ns == ns ); - - theCapExtent()->assertOk(); - _details->setCapFirstNewRecord(opCtx, DiskLoc()); -} - -DiskLoc CappedRecordStoreV1::__capAlloc(OperationContext* opCtx, int len) { - DiskLoc prev = cappedLastDelRecLastExtent(); - DiskLoc i = cappedFirstDeletedInCurExtent(); - DiskLoc ret; - for (; !i.isNull() && inCapExtent(i); prev = i, i = drec(i)->nextDeleted()) { - // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(), - // so make sure there's space to create a DR at the end. - if (drec(i)->lengthWithHeaders() >= len + 24) { - ret = i; - break; - } - } - - /* unlink ourself from the deleted list */ - if (!ret.isNull()) { - if (prev.isNull()) - setListOfAllDeletedRecords(opCtx, drec(ret)->nextDeleted()); - else - *opCtx->recoveryUnit()->writing(&drec(prev)->nextDeleted()) = drec(ret)->nextDeleted(); - *opCtx->recoveryUnit()->writing(&drec(ret)->nextDeleted()) = - DiskLoc().setInvalid(); // defensive. - invariant(drec(ret)->extentOfs() < ret.getOfs()); - } - - return ret; -} - -void CappedRecordStoreV1::cappedTruncateLastDelUpdate(OperationContext* opCtx) { - if (_details->capExtent() == _details->firstExtent(opCtx)) { - // Only one extent of the collection is in use, so there - // is no deleted record in a previous extent, so nullify - // cappedLastDelRecLastExtent(). - setLastDelRecLastExtent(opCtx, DiskLoc()); - } else { - // Scan through all deleted records in the collection - // until the last deleted record for the extent prior - // to the new capExtent is found. Then set - // cappedLastDelRecLastExtent() to that deleted record. - DiskLoc i = cappedListOfAllDeletedRecords(); - for (; !drec(i)->nextDeleted().isNull() && !inCapExtent(drec(i)->nextDeleted()); - i = drec(i)->nextDeleted()) - ; - // In our capped storage model, every extent must have at least one - // deleted record. Here we check that 'i' is not the last deleted - // record. (We expect that there will be deleted records in the new - // capExtent as well.) 
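__capAlloc() above is a first-fit scan over the cap extent's portion of the free list, but it insists on len + 24 bytes of headroom so that a residual DeletedRecord can always be split off and every extent keeps at least one entry in the deleted list. The scan in isolation (FreeRec and kSlack are illustrative names):

#include <vector>

struct FreeRec {
    int len;  // lengthWithHeaders of a deleted record in the cap extent
};

// First-fit with slack: accept only records that leave room to split off a
// residual DeletedRecord, mirroring the "len + 24" test in __capAlloc().
static int capAlloc(const std::vector<FreeRec>& freeList, int len) {
    const int kSlack = 24;
    for (int i = 0; i < static_cast<int>(freeList.size()); ++i)
        if (freeList[i].len >= len + kSlack)
            return i;  // index of the chosen deleted record
    return -1;         // nothing fits in the current cap extent
}

int main() {
    std::vector<FreeRec> freeList = {{40}, {80}, {200}};
    // 80 bytes is not enough for a 60-byte record: 60 + 24 of slack = 84.
    return capAlloc(freeList, 60) == 2 ? 0 : 1;
}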
- invariant(!drec(i)->nextDeleted().isNull()); - setLastDelRecLastExtent(opCtx, i); - } -} - -void CappedRecordStoreV1::cappedTruncateAfter(OperationContext* opCtx, - const char* ns, - DiskLoc end, - bool inclusive) { - invariant(cappedLastDelRecLastExtent().isValid()); - - // We iteratively remove the newest document until the newest document - // is 'end', then we remove 'end' if requested. - bool foundLast = false; - while (1) { - if (foundLast) { - // 'end' has been found and removed, so break. - break; - } - // 'curr' will point to the newest document in the collection. - const DiskLoc curr = theCapExtent()->lastRecord; - const RecordId currId = curr.toRecordId(); - invariant(!curr.isNull()); - if (curr == end) { - if (inclusive) { - // 'end' has been found, so break next iteration. - foundLast = true; - } else { - // 'end' has been found, so break. - break; - } - } - - // TODO The algorithm used in this function cannot generate an - // empty collection, but we could call emptyCappedCollection() in - // this case instead of asserting. - uassert(13415, "emptying the collection is not allowed", _details->numRecords() > 1); - - WriteUnitOfWork wunit(opCtx); - // Delete the newest record, and coalesce the new deleted - // record with existing deleted records. - Status status = _cappedCallback->aboutToDeleteCapped(opCtx, currId, dataFor(opCtx, currId)); - uassertStatusOK(status); - deleteRecord(opCtx, currId); - _compact(opCtx); - - // This is the case where we have not yet had to remove any - // documents to make room for other documents, and we are allocating - // documents from free space in fresh extents instead of reusing - // space from familiar extents. - if (!_details->capLooped()) { - // We just removed the last record from the 'capExtent', and - // the 'capExtent' can't be empty, so we set 'capExtent' to - // capExtent's prev extent. - if (theCapExtent()->lastRecord.isNull()) { - invariant(!theCapExtent()->xprev.isNull()); - // NOTE Because we didn't delete the last document, and - // capLooped() is false, capExtent is not the first extent - // so xprev will be nonnull. - _details->setCapExtent(opCtx, theCapExtent()->xprev); - theCapExtent()->assertOk(); - - // update cappedLastDelRecLastExtent() - cappedTruncateLastDelUpdate(opCtx); - } - wunit.commit(); - continue; - } - - // This is the case where capLooped() is true, and we just deleted - // from capExtent, and we just deleted capFirstNewRecord, which was - // the last record on the fresh side of capExtent. - // NOTE In this comparison, curr and potentially capFirstNewRecord - // may point to invalid data, but we can still compare the - // references themselves. - if (curr == _details->capFirstNewRecord()) { - // Set 'capExtent' to the first nonempty extent prior to the - // initial capExtent. There must be such an extent because we - // have not deleted the last document in the collection. It is - // possible that all extents other than the capExtent are empty. - // In this case we will keep the initial capExtent and specify - // that all records contained within are on the fresh rather than - // stale side of the extent. - DiskLoc newCapExtent = _details->capExtent(); - do { - // Find the previous extent, looping if necessary. - newCapExtent = (newCapExtent == _details->firstExtent(opCtx)) - ? 
_details->lastExtent(opCtx) - : _extentManager->getExtent(newCapExtent)->xprev; - _extentManager->getExtent(newCapExtent)->assertOk(); - } while (_extentManager->getExtent(newCapExtent)->firstRecord.isNull()); - _details->setCapExtent(opCtx, newCapExtent); - - // Place all documents in the new capExtent on the fresh side - // of the capExtent by setting capFirstNewRecord to the first - // document in the new capExtent. - _details->setCapFirstNewRecord(opCtx, theCapExtent()->firstRecord); - - // update cappedLastDelRecLastExtent() - cappedTruncateLastDelUpdate(opCtx); - } - - wunit.commit(); - } -} - -DiskLoc CappedRecordStoreV1::cappedListOfAllDeletedRecords() const { - return _details->deletedListEntry(0); -} - -void CappedRecordStoreV1::setListOfAllDeletedRecords(OperationContext* opCtx, const DiskLoc& loc) { - return _details->setDeletedListEntry(opCtx, 0, loc); -} - -DiskLoc CappedRecordStoreV1::cappedLastDelRecLastExtent() const { - return _details->deletedListEntry(1); -} - -void CappedRecordStoreV1::setLastDelRecLastExtent(OperationContext* opCtx, const DiskLoc& loc) { - return _details->setDeletedListEntry(opCtx, 1, loc); -} - -Extent* CappedRecordStoreV1::theCapExtent() const { - return _extentManager->getExtent(_details->capExtent()); -} - -void CappedRecordStoreV1::addDeletedRec(OperationContext* opCtx, const DiskLoc& dloc) { - DeletedRecord* d = opCtx->recoveryUnit()->writing(drec(dloc)); - - if (!cappedLastDelRecLastExtent().isValid()) { - // Initial extent allocation. Insert at end. - d->nextDeleted() = DiskLoc(); - if (cappedListOfAllDeletedRecords().isNull()) - setListOfAllDeletedRecords(opCtx, dloc); - else { - DiskLoc i = cappedListOfAllDeletedRecords(); - for (; !drec(i)->nextDeleted().isNull(); i = drec(i)->nextDeleted()) - ; - *opCtx->recoveryUnit()->writing(&drec(i)->nextDeleted()) = dloc; - } - } else { - d->nextDeleted() = cappedFirstDeletedInCurExtent(); - setFirstDeletedInCurExtent(opCtx, dloc); - // always _compact() after this so order doesn't matter - } -} - -std::unique_ptr<SeekableRecordCursor> CappedRecordStoreV1::getCursor(OperationContext* opCtx, - bool forward) const { - return stdx::make_unique<CappedRecordStoreV1Iterator>(opCtx, this, forward); -} - -vector<std::unique_ptr<RecordCursor>> CappedRecordStoreV1::getManyCursors( - OperationContext* opCtx) const { - vector<std::unique_ptr<RecordCursor>> cursors; - - if (!_details->capLooped()) { - // if we haven't looped yet, just spit out all extents (same as non-capped impl) - const Extent* ext; - for (DiskLoc extLoc = details()->firstExtent(opCtx); !extLoc.isNull(); - extLoc = ext->xnext) { - ext = _getExtent(opCtx, extLoc); - if (ext->firstRecord.isNull()) - continue; - - cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>( - opCtx, ext->firstRecord, this)); - } - } else { - // if we've looped we need to iterate the extents, starting and ending with the - // capExtent - const DiskLoc capExtent = details()->capExtent(); - invariant(!capExtent.isNull()); - invariant(capExtent.isValid()); - - // First do the "old" portion of capExtent if there is any - DiskLoc extLoc = capExtent; - { - const Extent* ext = _getExtent(opCtx, extLoc); - if (ext->firstRecord != details()->capFirstNewRecord()) { - // this means there is old data in capExtent - cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>( - opCtx, ext->firstRecord, this)); - } - - extLoc = ext->xnext.isNull() ? 
details()->firstExtent(opCtx) : ext->xnext; - } - - // Next handle all the other extents - while (extLoc != capExtent) { - const Extent* ext = _getExtent(opCtx, extLoc); - cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>( - opCtx, ext->firstRecord, this)); - - extLoc = ext->xnext.isNull() ? details()->firstExtent(opCtx) : ext->xnext; - } - - // Finally handle the "new" data in the capExtent - cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>( - opCtx, details()->capFirstNewRecord(), this)); - } - - return cursors; -} - -void CappedRecordStoreV1::_maybeComplain(OperationContext* opCtx, int len) const { - RARELY { - std::stringstream buf; - buf << "couldn't make room for record len: " << len << " in capped ns " << _ns << '\n'; - buf << "numRecords: " << numRecords(opCtx) << '\n'; - int i = 0; - for (DiskLoc e = _details->firstExtent(opCtx); !e.isNull(); - e = _extentManager->getExtent(e)->xnext, ++i) { - buf << " Extent " << i; - if (e == _details->capExtent()) - buf << " (capExtent)"; - buf << ' ' << e; - buf << '\n'; - - buf << " magic: " << hex << _extentManager->getExtent(e)->magic << dec - << " extent->ns: " << _extentManager->getExtent(e)->nsDiagnostic.toString() << '\n'; - buf << " fr: " << _extentManager->getExtent(e)->firstRecord.toString() - << " lr: " << _extentManager->getExtent(e)->lastRecord.toString() - << " extent->len: " << _extentManager->getExtent(e)->length << '\n'; - } - - warning() << buf.str(); - - // assume it is unusually large record; if not, something is broken - fassert(17438, len * 5 > _details->lastExtentSize(opCtx)); - } -} - -DiskLoc CappedRecordStoreV1::firstRecord(OperationContext* opCtx, - const DiskLoc& startExtent) const { - for (DiskLoc i = startExtent.isNull() ? _details->firstExtent(opCtx) : startExtent; !i.isNull(); - i = _extentManager->getExtent(i)->xnext) { - Extent* e = _extentManager->getExtent(i); - - if (!e->firstRecord.isNull()) - return e->firstRecord; - } - return DiskLoc(); -} - -DiskLoc CappedRecordStoreV1::lastRecord(OperationContext* opCtx, const DiskLoc& startExtent) const { - for (DiskLoc i = startExtent.isNull() ? _details->lastExtent(opCtx) : startExtent; !i.isNull(); - i = _extentManager->getExtent(i)->xprev) { - Extent* e = _extentManager->getExtent(i); - if (!e->lastRecord.isNull()) - return e->lastRecord; - } - return DiskLoc(); -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h deleted file mode 100644 index d74fc7c65ea..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped.h +++ /dev/null @@ -1,129 +0,0 @@ -// record_store_v1_capped.h - -/** -* Copyright (C) 2013 10gen Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. 
-* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/base/owned_pointer_vector.h" -#include "mongo/db/storage/capped_callback.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" - -namespace mongo { - -class CappedRecordStoreV1 final : public RecordStoreV1Base { -public: - CappedRecordStoreV1(OperationContext* opCtx, - CappedCallback* collection, - StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes); - - ~CappedRecordStoreV1() final; - - const char* name() const final { - return "CappedRecordStoreV1"; - } - - Status truncate(OperationContext* opCtx) final; - - /** - * Truncate documents newer than the document at 'end' from the capped - * collection. The collection cannot be completely emptied using this - * function. An assertion will be thrown if that is attempted. - * @param inclusive - Truncate 'end' as well iff true - */ - void cappedTruncateAfter(OperationContext* opCtx, RecordId end, bool inclusive) final; - - std::unique_ptr<SeekableRecordCursor> getCursor(OperationContext* opCtx, - bool forward) const final; - - std::vector<std::unique_ptr<RecordCursor>> getManyCursors(OperationContext* opCtx) const final; - - // Start from firstExtent by default. - DiskLoc firstRecord(OperationContext* opCtx, const DiskLoc& startExtent = DiskLoc()) const; - // Start from lastExtent by default. 
- DiskLoc lastRecord(OperationContext* opCtx, const DiskLoc& startExtent = DiskLoc()) const; - -protected: - bool isCapped() const final { - return true; - } - bool shouldPadInserts() const final { - return false; - } - - void setCappedCallback(CappedCallback* cb) final { - _cappedCallback = cb; - } - - StatusWith<DiskLoc> allocRecord(OperationContext* opCtx, - int lengthWithHeaders, - bool enforceQuota) final; - - void addDeletedRec(OperationContext* opCtx, const DiskLoc& dloc) final; - -private: - // -- start copy from cap.cpp -- - void _compact(OperationContext* opCtx); - DiskLoc cappedFirstDeletedInCurExtent() const; - void setFirstDeletedInCurExtent(OperationContext* opCtx, const DiskLoc& loc); - void cappedCheckMigrate(OperationContext* opCtx); - DiskLoc __capAlloc(OperationContext* opCtx, int len); - bool inCapExtent(const DiskLoc& dl) const; - DiskLoc cappedListOfAllDeletedRecords() const; - DiskLoc cappedLastDelRecLastExtent() const; - void setListOfAllDeletedRecords(OperationContext* opCtx, const DiskLoc& loc); - void setLastDelRecLastExtent(OperationContext* opCtx, const DiskLoc& loc); - Extent* theCapExtent() const; - bool nextIsInCapExtent(const DiskLoc& dl) const; - void advanceCapExtent(OperationContext* opCtx, StringData ns); - void cappedTruncateLastDelUpdate(OperationContext* opCtx); - - /** - * Truncate documents newer than the document at 'end' from the capped - * collection. The collection cannot be completely emptied using this - * function. An assertion will be thrown if that is attempted. - * @param inclusive - Truncate 'end' as well iff true - */ - void cappedTruncateAfter(OperationContext* opCtx, const char* ns, DiskLoc end, bool inclusive); - - void _maybeComplain(OperationContext* opCtx, int len) const; - - // -- end copy from cap.cpp -- - - CappedCallback* _cappedCallback; - - OwnedPointerVector<ExtentManager::CacheHint> _extentAdvice; - - friend class CappedRecordStoreV1Iterator; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp deleted file mode 100644 index 20324ffe5ee..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/** - * Copyright (C) 2013 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. 
If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" - -#include "mongo/db/catalog/collection.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" - -namespace mongo { - - -// -// Capped collection traversal -// -CappedRecordStoreV1Iterator::CappedRecordStoreV1Iterator(OperationContext* opCtx, - const CappedRecordStoreV1* collection, - bool forward) - : _opCtx(opCtx), _recordStore(collection), _forward(forward) { - const RecordStoreV1MetaData* nsd = _recordStore->details(); - - // If a start position isn't specified, we fill one out from the start of the - // collection. - if (_forward) { - // Going forwards. - if (!nsd->capLooped()) { - // If our capped collection doesn't loop around, the first record is easy. - _curr = collection->firstRecord(_opCtx); - } else { - // Our capped collection has "looped' around. - // Copied verbatim from ForwardCappedCursor::init. - // TODO ELABORATE - _curr = _getExtent(nsd->capExtent())->firstRecord; - if (!_curr.isNull() && _curr == nsd->capFirstNewRecord()) { - _curr = _getExtent(nsd->capExtent())->lastRecord; - _curr = nextLoop(_curr); - } - } - } else { - // Going backwards - if (!nsd->capLooped()) { - // Start at the end. - _curr = collection->lastRecord(_opCtx); - } else { - _curr = _getExtent(nsd->capExtent())->lastRecord; - } - } -} - -boost::optional<Record> CappedRecordStoreV1Iterator::next() { - if (isEOF()) - return {}; - auto toReturn = _curr.toRecordId(); - _curr = getNextCapped(_curr); - return {{toReturn, _recordStore->RecordStore::dataFor(_opCtx, toReturn)}}; -} - -boost::optional<Record> CappedRecordStoreV1Iterator::seekExact(const RecordId& id) { - _curr = getNextCapped(DiskLoc::fromRecordId(id)); - return {{id, _recordStore->RecordStore::dataFor(_opCtx, id)}}; -} - -void CappedRecordStoreV1Iterator::invalidate(OperationContext* opCtx, const RecordId& id) { - const DiskLoc dl = DiskLoc::fromRecordId(id); - if (dl == _curr) { - // We *could* move to the next thing, since there is actually a next - // thing, but according to clientcursor.cpp: - // "note we cannot advance here. if this condition occurs, writes to the oplog - // have "caught" the reader. skipping ahead, the reader would miss potentially - // important data." - // We don't really need to worry about rollback here, as the very next write would - // invalidate the cursor anyway. - _curr = DiskLoc(); - _killedByInvalidate = true; - } -} - -void CappedRecordStoreV1Iterator::save() {} - -bool CappedRecordStoreV1Iterator::restore() { - return !_killedByInvalidate; -} - -DiskLoc CappedRecordStoreV1Iterator::getNextCapped(const DiskLoc& dl) { - invariant(!dl.isNull()); - const RecordStoreV1MetaData* details = _recordStore->details(); - - if (_forward) { - // If it's not looped, it's easy. - if (!_recordStore->details()->capLooped()) { - return _getNextRecord(dl); - } - - // TODO ELABORATE - // EOF. - if (dl == _getExtent(details->capExtent())->lastRecord) { - return DiskLoc(); - } - - DiskLoc ret = nextLoop(dl); - - // If we become capFirstNewRecord from same extent, advance to next extent. 
- if (ret == details->capFirstNewRecord() && - ret != _getExtent(details->capExtent())->firstRecord) { - ret = nextLoop(_getExtent(details->capExtent())->lastRecord); - } - - // If we have just gotten to beginning of capExtent, skip to capFirstNewRecord - if (ret == _getExtent(details->capExtent())->firstRecord) { - ret = details->capFirstNewRecord(); - } - - return ret; - } else { - if (!details->capLooped()) { - return _getPrevRecord(dl); - } - - // TODO ELABORATE - // Last record - if (details->capFirstNewRecord() == _getExtent(details->capExtent())->firstRecord) { - if (dl == nextLoop(_getExtent(details->capExtent())->lastRecord)) { - return DiskLoc(); - } - } else { - if (dl == _getExtent(details->capExtent())->firstRecord) { - return DiskLoc(); - } - } - - DiskLoc ret; - // If we are capFirstNewRecord, advance to prev extent, otherwise just get prev. - if (dl == details->capFirstNewRecord()) { - ret = prevLoop(_getExtent(details->capExtent())->firstRecord); - } else { - ret = prevLoop(dl); - } - - // If we just became last in cap extent, advance past capFirstNewRecord - // (We know ext(capExtent)->firstRecord != capFirstNewRecord, since would - // have returned DiskLoc() earlier otherwise.) - if (ret == _getExtent(details->capExtent())->lastRecord) { - ret = _getPrevRecord(details->capFirstNewRecord()); - } - - return ret; - } -} - -DiskLoc CappedRecordStoreV1Iterator::nextLoop(const DiskLoc& prev) { - // TODO ELABORATE - DiskLoc next = _getNextRecord(prev); - if (!next.isNull()) { - return next; - } - return _recordStore->firstRecord(_opCtx); -} - -DiskLoc CappedRecordStoreV1Iterator::prevLoop(const DiskLoc& curr) { - // TODO ELABORATE - DiskLoc prev = _getPrevRecord(curr); - if (!prev.isNull()) { - return prev; - } - return _recordStore->lastRecord(_opCtx); -} - - -Extent* CappedRecordStoreV1Iterator::_getExtent(const DiskLoc& loc) { - return _recordStore->_extentManager->getExtent(loc); -} - -DiskLoc CappedRecordStoreV1Iterator::_getNextRecord(const DiskLoc& loc) { - return _recordStore->getNextRecord(_opCtx, loc); -} - -DiskLoc CappedRecordStoreV1Iterator::_getPrevRecord(const DiskLoc& loc) { - return _recordStore->getPrevRecord(_opCtx, loc); -} - -std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForNext() const { - return _recordStore->_extentManager->recordNeedsFetch(_curr); -} - -std::unique_ptr<RecordFetcher> CappedRecordStoreV1Iterator::fetcherForId(const RecordId& id) const { - return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id)); -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h deleted file mode 100644 index 08065109c3f..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Copyright (C) 2013 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/record_store.h" - -namespace mongo { - -class CappedRecordStoreV1; - -struct Extent; - -/** - * This class iterates over a capped collection identified by 'ns'. - * The collection must exist when the constructor is called. - */ -class CappedRecordStoreV1Iterator final : public SeekableRecordCursor { -public: - CappedRecordStoreV1Iterator(OperationContext* opCtx, - const CappedRecordStoreV1* collection, - bool forward); - - boost::optional<Record> next() final; - boost::optional<Record> seekExact(const RecordId& id) final; - void save() final; - bool restore() final; - void detachFromOperationContext() final { - _opCtx = nullptr; - } - void reattachToOperationContext(OperationContext* opCtx) final { - _opCtx = opCtx; - } - void invalidate(OperationContext* opCtx, const RecordId& dl) final; - std::unique_ptr<RecordFetcher> fetcherForNext() const final; - std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final; - -private: - void advance(); - bool isEOF() { - return _curr.isNull(); - } - - /** - * Internal collection navigation helper methods. - */ - DiskLoc getNextCapped(const DiskLoc& dl); - DiskLoc prevLoop(const DiskLoc& curr); - DiskLoc nextLoop(const DiskLoc& prev); - - // some helpers - these move to RecordStore probably - Extent* _getExtent(const DiskLoc& loc); - DiskLoc _getNextRecord(const DiskLoc& loc); - DiskLoc _getPrevRecord(const DiskLoc& loc); - - // transactional context for read locks. Not owned by us - OperationContext* _opCtx; - - // The collection we're iterating over. - const CappedRecordStoreV1* const _recordStore; - - // The result returned on the next call to getNext(). - DiskLoc _curr; - - const bool _forward; - - // If invalidate kills the DiskLoc we need to move forward, we kill the iterator. See the - // comment in the body of invalidate(...). - bool _killedByInvalidate = false; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp deleted file mode 100644 index 280ad6ccee0..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_capped_test.cpp +++ /dev/null @@ -1,797 +0,0 @@ -// record_store_v1_capped_test.cpp - -/** - * Copyright (C) 2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_capped_iterator.h" - -#include "mongo/db/operation_context_noop.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" - -#include "mongo/unittest/unittest.h" - -using namespace mongo; - -namespace { - -using std::string; -using std::vector; - -// Provides data to be inserted. Must be large enough for largest possible record. -// Should be in BSS so unused portions should be free. -char zeros[20 * 1024 * 1024] = {}; - -class DummyCappedCallback : public CappedCallback { -public: - Status aboutToDeleteCapped(OperationContext* opCtx, const RecordId& loc, RecordData data) { - deleted.push_back(DiskLoc::fromRecordId(loc)); - return Status::OK(); - } - - bool haveCappedWaiters() { - return false; - } - void notifyCappedWaitersIfNeeded() {} - - vector<DiskLoc> deleted; -}; - -void simpleInsertTest(const char* buf, int size) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - - string myns = "test.simple1"; - CappedRecordStoreV1 rs(&opCtx, &cb, myns, md, &em, false); - - rs.increaseStorageSize(&opCtx, 1024, false); - - ASSERT_NOT_OK(rs.insertRecord(&opCtx, buf, 3, Timestamp(), true).getStatus()); - - ASSERT_OK(rs.insertRecord(&opCtx, buf, size, Timestamp(), true).getStatus()); - - { - BSONObjBuilder b; - int64_t storageSize = rs.storageSize(&opCtx, &b); - BSONObj obj = b.obj(); - ASSERT_EQUALS(1, obj["numExtents"].numberInt()); - ASSERT_EQUALS(storageSize, em.quantizeExtentSize(1024)); - } - - for (int i = 0; i < 1000; i++) { - ASSERT_OK(rs.insertRecord(&opCtx, buf, size, Timestamp(), true).getStatus()); - } - - long long start = md->numRecords(); - for (int i = 0; i < 1000; i++) { - ASSERT_OK(rs.insertRecord(&opCtx, buf, size, Timestamp(), true).getStatus()); - } - ASSERT_EQUALS(start, md->numRecords()); - ASSERT_GREATER_THAN(start, 100); - ASSERT_LESS_THAN(start, 1000); -} - -TEST(CappedRecordStoreV1, SimpleInsertSize4) { - simpleInsertTest("abcd", 4); -} -TEST(CappedRecordStoreV1, SimpleInsertSize8) { - simpleInsertTest("abcdefgh", 8); -} - -TEST(CappedRecordStoreV1, EmptySingleExtent) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 
rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 100}, {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1100), 900}, {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped - } -} - -TEST(CappedRecordStoreV1, FirstLoopWithSingleExtentExactSize) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1200), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1500), 50}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); // unlooped - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, // last old record - {DiskLoc(0, 1000), 100}, // first new record - {}}; - LocAndSize drecs[] = { - {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug - {DiskLoc(0, 1500), 50}, // gap at end of extent - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -TEST(CappedRecordStoreV1, NonFirstLoopWithSingleExtentExactSize) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1200), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1500), 50}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc(0, 1000)); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, // last old record - {DiskLoc(0, 1000), 100}, // first new record - {}}; - LocAndSize drecs[] = { - {DiskLoc(0, 1100), 100}, // gap after newest record XXX this is probably a bug - {DiskLoc(0, 1500), 50}, // gap at end of extent - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -/** - * Current code always tries to leave 24 bytes to create a DeletedRecord. 
- */ -TEST(CappedRecordStoreV1, WillLoopWithout24SpareBytes) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1200), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1500), 123}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc(0, 1000)); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1200), 100}, // first old record - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, // last old record - {DiskLoc(0, 1000), 100}, // first new record - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1100), 100}, // gap after newest record - {DiskLoc(0, 1500), 123}, // gap at end of extent - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -TEST(CappedRecordStoreV1, WontLoopWith24SpareBytes) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1200), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1500), 124}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc(0, 1000)); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1200), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(0, 1400), 100}, - {DiskLoc(0, 1500), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1600), 24}, // gap at end of extent - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -TEST(CappedRecordStoreV1, MoveToSecondExtentUnLooped) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - // Two extents, each with 1000 bytes. 
- LocAndSize records[] = { - {DiskLoc(0, 1000), 500}, {DiskLoc(0, 1500), 300}, {DiskLoc(0, 1800), 100}, {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 500}, - {DiskLoc(0, 1500), 300}, - {DiskLoc(0, 1800), 100}, - - {DiskLoc(1, 1000), 100}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1100), 900}, {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc().setInvalid()); // unlooped - } -} - -TEST(CappedRecordStoreV1, MoveToSecondExtentLooped) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - // Two extents, each with 1000 bytes. - LocAndSize records[] = {{DiskLoc(0, 1800), 100}, // old - {DiskLoc(0, 1000), 500}, // first new - {DiskLoc(0, 1500), 400}, - - {DiskLoc(1, 1000), 300}, - {DiskLoc(1, 1300), 600}, - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1900), 100}, {DiskLoc(1, 1900), 100}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc(0, 1000)); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 200 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 500}, - {DiskLoc(0, 1500), 400}, - - {DiskLoc(1, 1300), 600}, // old - {DiskLoc(1, 1000), 200}, // first new - {}}; - LocAndSize drecs[] = { - {DiskLoc(0, 1800), 200}, {DiskLoc(1, 1200), 100}, {DiskLoc(1, 1900), 100}, {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(1, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(1, 1000)); - } -} - -// Larger than storageSize (fails early) -TEST(CappedRecordStoreV1, OversizedRecordHuge) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - StatusWith<RecordId> status = rs.insertRecord(&opCtx, zeros, 16000, Timestamp(), false); - ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped); - ASSERT_STRING_CONTAINS(status.getStatus().reason(), "larger than capped size"); -} - -// Smaller than storageSize, but larger than usable space (fails late) -TEST(CappedRecordStoreV1, OversizedRecordMedium) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, 
DiskLoc().setInvalid()); - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - StatusWith<RecordId> status = - rs.insertRecord(&opCtx, zeros, 1004 - MmapV1RecordHeader::HeaderSize, Timestamp(), false); - ASSERT_EQUALS(status.getStatus(), ErrorCodes::DocTooLargeForCapped); - ASSERT_STRING_CONTAINS(status.getStatus().reason(), "doesn't fit"); -} - -// -// XXX The CappedRecordStoreV1Scrambler suite of tests describe existing behavior that is less -// than ideal. Any improved implementation will need to be able to handle a collection that has -// been scrambled like this. -// - -/** - * This is a minimal example that shows the current allocator laying out records out-of-order. - */ -TEST(CappedRecordStoreV1Scrambler, Minimal) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - // Starting with a single empty 1000 byte extent. - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); // unlooped - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 500 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 300 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 400 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); // won't fit at end so wraps - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 120 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); // fits at end - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 60 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); // fits in earlier hole - - { - LocAndSize recs[] = {{DiskLoc(0, 1500), 300}, // 2nd insert - {DiskLoc(0, 1000), 400}, // 3rd (1st new) - {DiskLoc(0, 1800), 120}, // 4th - {DiskLoc(0, 1400), 60}, // 5th - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1460), 40}, {DiskLoc(0, 1920), 80}, {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -/** - * This tests a specially crafted set of inserts that scrambles a capped collection in a way - * that leaves 4 deleted records in a single extent. - */ -TEST(CappedRecordStoreV1Scrambler, FourDeletedRecordsInSingleExtent) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(true, 0); - DummyCappedCallback cb; - CappedRecordStoreV1 rs(&opCtx, &cb, "test.foo", md, &em, false); - - { - // Starting with a single empty 1000 byte extent. - LocAndSize records[] = {{}}; - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); // unlooped - initializeV1RS(&opCtx, records, drecs, NULL, &em, md); - } - - // This list of sizes was empirically generated to achieve this outcome. Don't think too - // much about them. 
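The out-of-order layout asserted by these Scrambler tests follows from two behaviors visible above: allocation is first-fit over the deleted-record list, and a capped store wraps and frees its oldest record when an insert no longer fits at the end of the extent. The standalone sketch below (a simplified model with hypothetical Hole and firstFit names, not the MMAPv1 code) reproduces the layout that the Minimal test asserts:

#include <cstdint>
#include <iostream>
#include <list>

// One free region inside an extent.
struct Hole {
    int32_t ofs;  // offset of the free region
    int32_t len;  // length of the free region
};

// First-fit allocation: take 'len' bytes from the front of the first hole
// that is large enough, shrinking (or consuming) that hole. Returns -1 if
// nothing fits.
int32_t firstFit(std::list<Hole>& holes, int32_t len) {
    for (auto it = holes.begin(); it != holes.end(); ++it) {
        if (it->len < len)
            continue;
        const int32_t ofs = it->ofs;
        it->ofs += len;
        it->len -= len;
        if (it->len == 0)
            holes.erase(it);
        return ofs;
    }
    return -1;
}

int main() {
    // A single fresh 1000-byte extent at offset 1000, as in the Minimal test.
    std::list<Hole> holes{{1000, 1000}};
    std::cout << firstFit(holes, 500) << '\n';  // 1000: 1st insert
    std::cout << firstFit(holes, 300) << '\n';  // 1500: 2nd insert
    // A 400-byte record no longer fits in the 200 bytes left at the end, so
    // the capped store wraps: the oldest record (500 bytes at 1000) is freed.
    holes.push_front({1000, 500});
    std::cout << firstFit(holes, 400) << '\n';  // 1000: 3rd insert, now physically first
    std::cout << firstFit(holes, 120) << '\n';  // 1800: 4th insert, at the end
    std::cout << firstFit(holes, 60) << '\n';   // 1400: 5th insert, fills an earlier hole
    // Leftover holes: {1460, 40} and {1920, 80}, matching the test's drecs.
}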
- ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 500 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 300 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 304 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 76 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 96 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 76 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 200 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 200 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 56 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 96 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 104 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 96 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 60 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 60 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 146 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 146 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 40 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 40 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 36 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 100 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 96 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 200 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 60 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - ASSERT_OK( - rs.insertRecord(&opCtx, zeros, 64 - MmapV1RecordHeader::HeaderSize, Timestamp(), false) - .getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1148), 148}, - {DiskLoc(0, 1936), 40}, - {DiskLoc(0, 1712), 40}, - {DiskLoc(0, 1296), 36}, - {DiskLoc(0, 1752), 100}, - {DiskLoc(0, 1332), 
96}, - {DiskLoc(0, 1428), 200}, - {DiskLoc(0, 1852), 60}, - {DiskLoc(0, 1000), 64}, // (1st new) - {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1064), 84}, - {DiskLoc(0, 1976), 24}, - {DiskLoc(0, 1912), 24}, - {DiskLoc(0, 1628), 84}, - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - ASSERT_EQUALS(md->capExtent(), DiskLoc(0, 0)); - ASSERT_EQUALS(md->capFirstNewRecord(), DiskLoc(0, 1000)); - } -} - -// -// The CappedRecordStoreV1QueryStage tests some nitty-gritty capped -// collection details. Ported and polished from pdfiletests.cpp. -// - -class CollscanHelper { -public: - CollscanHelper(int nExtents) - : md(new DummyRecordStoreV1MetaData(true, 0)), rs(&opCtx, &cb, ns(), md, &em, false) { - LocAndSize recs[] = {{}}; - LocAndSize drecs[8]; - ASSERT_LESS_THAN(nExtents, 8); - for (int j = 0; j < nExtents; ++j) { - drecs[j].loc = DiskLoc(j, 1000); - drecs[j].size = 1000; - } - drecs[nExtents].loc = DiskLoc(); - drecs[nExtents].size = 0; - - md->setCapExtent(&opCtx, DiskLoc(0, 0)); - md->setCapFirstNewRecord(&opCtx, DiskLoc().setInvalid()); // unlooped - initializeV1RS(&opCtx, recs, drecs, NULL, &em, md); - } - - // Insert bypasses standard alloc/insert routines to use the extent we want. - // TODO: Directly declare resulting record store state instead of procedurally creating it - DiskLoc insert(const DiskLoc& ext, int i) { - // Copied verbatim. - BSONObjBuilder b; - b.append("a", i); - BSONObj o = b.done(); - int len = o.objsize(); - Extent* e = em.getExtent(ext); - e = opCtx.recoveryUnit()->writing(e); - int ofs; - if (e->lastRecord.isNull()) { - ofs = ext.getOfs() + (e->_extentData - (char*)e); - } else { - ofs = e->lastRecord.getOfs() + em.recordForV1(e->lastRecord)->lengthWithHeaders(); - } - DiskLoc dl(ext.a(), ofs); - MmapV1RecordHeader* r = em.recordForV1(dl); - r = (MmapV1RecordHeader*)opCtx.recoveryUnit()->writingPtr( - r, MmapV1RecordHeader::HeaderSize + len); - r->lengthWithHeaders() = MmapV1RecordHeader::HeaderSize + len; - r->extentOfs() = e->myLoc.getOfs(); - r->nextOfs() = DiskLoc::NullOfs; - r->prevOfs() = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs(); - memcpy(r->data(), o.objdata(), len); - if (e->firstRecord.isNull()) - e->firstRecord = dl; - else - opCtx.recoveryUnit()->writingInt(em.recordForV1(e->lastRecord)->nextOfs()) = ofs; - e->lastRecord = dl; - return dl; - } - - // TODO: Directly assert the desired record store state instead of just walking it - void walkAndCount(int expectedCount) { - // Walk the collection going forward. - { - CappedRecordStoreV1Iterator cursor(&opCtx, &rs, /*forward=*/true); - int resultCount = 0; - while (auto record = cursor.next()) { - ++resultCount; - } - - ASSERT_EQUALS(resultCount, expectedCount); - } - - // Walk the collection going backwards. 
- { - CappedRecordStoreV1Iterator cursor(&opCtx, &rs, /*forward=*/false); - int resultCount = expectedCount; - while (auto record = cursor.next()) { - --resultCount; - } - - ASSERT_EQUALS(resultCount, 0); - } - } - - static const char* ns() { - return "unittests.QueryStageCollectionScanCapped"; - } - - OperationContextNoop opCtx; - DummyRecordStoreV1MetaData* md; - DummyExtentManager em; - -private: - DummyCappedCallback cb; - CappedRecordStoreV1 rs; -}; - - -TEST(CappedRecordStoreV1QueryStage, CollscanCappedBase) { - CollscanHelper h(1); - h.walkAndCount(0); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanEmptyLooped) { - CollscanHelper h(1); - h.md->setCapFirstNewRecord(&h.opCtx, DiskLoc()); - h.walkAndCount(0); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanEmptyMultiExtentLooped) { - CollscanHelper h(3); - h.md->setCapFirstNewRecord(&h.opCtx, DiskLoc()); - h.walkAndCount(0); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanSingle) { - CollscanHelper h(1); - - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 0)); - h.walkAndCount(1); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanNewCapFirst) { - CollscanHelper h(1); - DiskLoc x = h.insert(h.md->capExtent(), 0); - h.md->setCapFirstNewRecord(&h.opCtx, x); - h.insert(h.md->capExtent(), 1); - h.walkAndCount(2); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanNewCapMiddle) { - CollscanHelper h(1); - h.insert(h.md->capExtent(), 0); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 1)); - h.insert(h.md->capExtent(), 2); - h.walkAndCount(3); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanFirstExtent) { - CollscanHelper h(2); - h.insert(h.md->capExtent(), 0); - h.insert(h.md->lastExtent(&h.opCtx), 1); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 2)); - h.insert(h.md->capExtent(), 3); - h.walkAndCount(4); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanLastExtent) { - CollscanHelper h(2); - h.md->setCapExtent(&h.opCtx, h.md->lastExtent(&h.opCtx)); - h.insert(h.md->capExtent(), 0); - h.insert(h.md->firstExtent(&h.opCtx), 1); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 2)); - h.insert(h.md->capExtent(), 3); - h.walkAndCount(4); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanMidExtent) { - CollscanHelper h(3); - h.md->setCapExtent(&h.opCtx, h.em.getExtent(h.md->firstExtent(&h.opCtx))->xnext); - h.insert(h.md->capExtent(), 0); - h.insert(h.md->lastExtent(&h.opCtx), 1); - h.insert(h.md->firstExtent(&h.opCtx), 2); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 3)); - h.insert(h.md->capExtent(), 4); - h.walkAndCount(5); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanAloneInExtent) { - CollscanHelper h(3); - h.md->setCapExtent(&h.opCtx, h.em.getExtent(h.md->firstExtent(&h.opCtx))->xnext); - h.insert(h.md->lastExtent(&h.opCtx), 0); - h.insert(h.md->firstExtent(&h.opCtx), 1); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 2)); - h.walkAndCount(3); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanFirstInExtent) { - CollscanHelper h(3); - h.md->setCapExtent(&h.opCtx, h.em.getExtent(h.md->firstExtent(&h.opCtx))->xnext); - h.insert(h.md->lastExtent(&h.opCtx), 0); - h.insert(h.md->firstExtent(&h.opCtx), 1); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 2)); - h.insert(h.md->capExtent(), 3); - h.walkAndCount(4); -} - -TEST(CappedRecordStoreV1QueryStage, CollscanLastInExtent) { - CollscanHelper h(3); - h.md->setCapExtent(&h.opCtx, h.em.getExtent(h.md->firstExtent(&h.opCtx))->xnext); - 
h.insert(h.md->capExtent(), 0); - h.insert(h.md->lastExtent(&h.opCtx), 1); - h.insert(h.md->firstExtent(&h.opCtx), 2); - h.md->setCapFirstNewRecord(&h.opCtx, h.insert(h.md->capExtent(), 3)); - h.walkAndCount(4); -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp deleted file mode 100644 index 872c29e112b..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.cpp +++ /dev/null @@ -1,210 +0,0 @@ -/** - * Copyright (C) 2014 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h" - -#include "mongo/db/catalog/collection.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::endl; - -RecordStoreV1RepairCursor::RecordStoreV1RepairCursor(OperationContext* opCtx, - const RecordStoreV1Base* recordStore) - : _opCtx(opCtx), _recordStore(recordStore), _stage(FORWARD_SCAN) { - // Position the iterator at the first record - // - advance(); -} - -boost::optional<Record> RecordStoreV1RepairCursor::next() { - if (_currRecord.isNull()) - return {}; - auto out = _currRecord.toRecordId(); - advance(); - return {{out, _recordStore->dataFor(_opCtx, out)}}; -} - -void RecordStoreV1RepairCursor::advance() { - const ExtentManager* em = _recordStore->_extentManager; - - while (true) { - if (_currRecord.isNull()) { - if (!_advanceToNextValidExtent()) { - return; - } - - _seenInCurrentExtent.clear(); - - // Otherwise _advanceToNextValidExtent would have returned false - // - invariant(!_currExtent.isNull()); - - const Extent* e = em->getExtent(_currExtent, false); - _currRecord = (FORWARD_SCAN == _stage ? 
e->firstRecord : e->lastRecord); - } else { - switch (_stage) { - case FORWARD_SCAN: - _currRecord = _recordStore->getNextRecordInExtent(_opCtx, _currRecord); - break; - case BACKWARD_SCAN: - _currRecord = _recordStore->getPrevRecordInExtent(_opCtx, _currRecord); - break; - default: - invariant(!"This should never be reached."); - break; - } - } - - if (_currRecord.isNull()) { - continue; - } - - // Validate the contents of the record's disk location and deduplicate - // - if (!_seenInCurrentExtent.insert(_currRecord).second) { - error() << "infinite loop in extent, seen: " << _currRecord << " before" << endl; - _currRecord = DiskLoc(); - continue; - } - - if (_currRecord.getOfs() <= 0) { - error() << "offset is 0 for record which should be impossible" << endl; - _currRecord = DiskLoc(); - continue; - } - - return; - } -} - -bool RecordStoreV1RepairCursor::_advanceToNextValidExtent() { - const ExtentManager* em = _recordStore->_extentManager; - - while (true) { - if (_currExtent.isNull()) { - switch (_stage) { - case FORWARD_SCAN: - _currExtent = _recordStore->details()->firstExtent(_opCtx); - break; - case BACKWARD_SCAN: - _currExtent = _recordStore->details()->lastExtent(_opCtx); - break; - default: - invariant(DONE == _stage); - return false; - } - } else { - // If _currExtent is not NULL, then it must point to a valid extent, so no extra - // checks here. - // - const Extent* e = em->getExtent(_currExtent, false); - _currExtent = (FORWARD_SCAN == _stage ? e->xnext : e->xprev); - } - - bool hasNextExtent = !_currExtent.isNull(); - - // Sanity checks for the extent's disk location - // - if (hasNextExtent && (!_currExtent.isValid() || (_currExtent.getOfs() < 0))) { - error() << "Invalid extent location: " << _currExtent << endl; - - // Switch the direction of scan - // - hasNextExtent = false; - } - - if (hasNextExtent) { - break; - } - - // Swap the direction of scan and loop again - // - switch (_stage) { - case FORWARD_SCAN: - _stage = BACKWARD_SCAN; - break; - case BACKWARD_SCAN: - _stage = DONE; - break; - default: - invariant(!"This should never be reached."); - break; - } - - _currExtent = DiskLoc(); - } - - - // Check _currExtent's contents for validity, but do not count is as failure if they - // don't check out. - // - const Extent* e = em->getExtent(_currExtent, false); - if (!e->isOk()) { - warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl; - } - - log() << (FORWARD_SCAN == _stage ? "FORWARD" : "BACKWARD") << " Extent loc: " << _currExtent - << ", length: " << e->length << endl; - - return true; -} - -void RecordStoreV1RepairCursor::invalidate(OperationContext* opCtx, const RecordId& id) { - // If we see this record again it probably means it was reinserted rather than an infinite - // loop. If we do loop, we should quickly hit another seen record that hasn't been - // invalidated. - DiskLoc dl = DiskLoc::fromRecordId(id); - _seenInCurrentExtent.erase(dl); - - if (_currRecord == dl) { - // The DiskLoc being invalidated is also the one pointed at by this iterator. We - // advance the iterator so it's not pointing at invalid data. - // We don't worry about undoing invalidations on rollback here, as we shouldn't have - // concurrent writes that can rollback to a database we're trying to recover. - advance(); - - if (_currRecord == dl) { - // Even after advancing the iterator, we're still pointing at the DiskLoc being - // invalidated. 
This is expected when 'dl' is the last DiskLoc in the FORWARD scan, - // and the initial call to getNext() moves the iterator to the first loc in the - // BACKWARDS scan. - advance(); - } - - invariant(_currRecord != dl); - } -} - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h deleted file mode 100644 index d95683a7c42..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_repair_iterator.h +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (C) 2014 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#pragma once - -#include <set> - -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" -#include "mongo/db/storage/record_store.h" - -namespace mongo { - -/** - * This iterator will go over the collection twice - once going forward (first extent -> last - * extent) and once backwards in an attempt to salvage potentially corrupted or unreachable - * records. It is used by the mongodump --repair option. - */ -class RecordStoreV1RepairCursor final : public RecordCursor { -public: - RecordStoreV1RepairCursor(OperationContext* opCtx, const RecordStoreV1Base* recordStore); - - boost::optional<Record> next() final; - void invalidate(OperationContext* opCtx, const RecordId& dl); - void save() final {} - bool restore() final { - return true; - } - void detachFromOperationContext() final { - _opCtx = nullptr; - } - void reattachToOperationContext(OperationContext* opCtx) final { - _opCtx = opCtx; - } - - // Explicitly not supporting fetcherForNext(). The expected use case for this class is a - // special offline operation where there are no concurrent operations, so it would be better - // to take the pagefault inline with the operation. - -private: - void advance(); - - /** - * Based on the direction of scan, finds the next valid (un-corrupted) extent in the chain - * and sets _currExtent to point to that. - * - * @return true if valid extent was found (_currExtent will not be null) - * false otherwise and _currExtent will be null - */ - bool _advanceToNextValidExtent(); - - // transactional context for read locks. 
Not owned by us - OperationContext* _opCtx; - - // Reference to the owning RecordStore. The store must not be deleted while there are - // active iterators on it. - // - const RecordStoreV1Base* _recordStore; - - DiskLoc _currExtent; - DiskLoc _currRecord; - - enum Stage { FORWARD_SCAN = 0, BACKWARD_SCAN = 1, DONE = 2 }; - - Stage _stage; - - // Used to find cycles within an extent. Cleared after each extent has been processed. - // - std::set<DiskLoc> _seenInCurrentExtent; -}; - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp deleted file mode 100644 index fc30532ed31..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.cpp +++ /dev/null @@ -1,486 +0,0 @@ -// record_store_v1_simple.cpp - -/** - * Copyright (C) 2013-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" - -#include "mongo/base/counter.h" -#include "mongo/db/catalog/collection.h" -#include "mongo/db/client.h" -#include "mongo/db/commands/server_status_metric.h" -#include "mongo/db/curop.h" -#include "mongo/db/operation_context.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h" -#include "mongo/db/storage/mmap_v1/touch_pages.h" -#include "mongo/stdx/memory.h" -#include "mongo/util/log.h" -#include "mongo/util/mongoutils/str.h" -#include "mongo/util/progress_meter.h" -#include "mongo/util/timer.h" - -namespace mongo { - -using std::endl; -using std::vector; - -static Counter64 freelistAllocs; -static Counter64 freelistBucketExhausted; -static Counter64 freelistIterations; - -// TODO figure out what to do about these. 
-static ServerStatusMetricField<Counter64> dFreelist1("storage.freelist.search.requests", - &freelistAllocs); - -static ServerStatusMetricField<Counter64> dFreelist2("storage.freelist.search.bucketExhausted", - &freelistBucketExhausted); - -static ServerStatusMetricField<Counter64> dFreelist3("storage.freelist.search.scanned", - &freelistIterations); - -SimpleRecordStoreV1::SimpleRecordStoreV1(OperationContext* opCtx, - StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes) - : RecordStoreV1Base(ns, details, em, isSystemIndexes) { - invariant(!details->isCapped()); - _normalCollection = NamespaceString::normal(ns); -} - -SimpleRecordStoreV1::~SimpleRecordStoreV1() {} - -DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* opCtx, int lenToAllocRaw) { - // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the - // correct deleted list each time we try to allocate a new record. This ensures we won't - // orphan any data when upgrading from old versions, without needing a long upgrade phase. - // This is done before we try to allocate the new record so we can take advantage of the new - // space immediately. - { - const DiskLoc head = _details->deletedListLegacyGrabBag(); - if (!head.isNull()) { - _details->setDeletedListLegacyGrabBag(opCtx, drec(head)->nextDeleted()); - addDeletedRec(opCtx, head); - } - } - - // align size up to a multiple of 4 - const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1); - - freelistAllocs.increment(); - DiskLoc loc; - DeletedRecord* dr = NULL; - { - int myBucket; - for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) { - // Only look at the first entry in each bucket. This works because we are either - // quantizing or allocating fixed-size blocks. - const DiskLoc head = _details->deletedListEntry(myBucket); - if (head.isNull()) - continue; - DeletedRecord* const candidate = drec(head); - if (candidate->lengthWithHeaders() >= lenToAlloc) { - loc = head; - dr = candidate; - break; - } - } - - if (!dr) - return DiskLoc(); // no space - - // Unlink ourself from the deleted list - _details->setDeletedListEntry(opCtx, myBucket, dr->nextDeleted()); - *opCtx->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive - } - - invariant(dr->extentOfs() < loc.getOfs()); - - // Split the deleted record if it has at least as much left over space as our smallest - // allocation size. Otherwise, just take the whole DeletedRecord. 
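Stated compactly, the allocation path above rounds the request up to a 4-byte multiple, walks the buckets for the first deleted record that is large enough, and splits that record only when the leftover is big enough to be a useful DeletedRecord. A simplified restatement of the arithmetic (illustrative helper names; the real code mutates DeletedRecord headers in place under the recovery unit):

#include <cassert>
#include <cstdint>

// Round a raw length up to a multiple of 4, as _allocFromExistingExtents does.
int32_t roundUp4(int32_t raw) {
    return (raw + 3) & ~3;
}

// Outcome of serving an allocation from a deleted record of length 'recLen'
// (with headers). 'smallestBucket' stands in for bucketSizes[0] above.
struct SplitDecision {
    int32_t allocLen;      // length handed to the new record
    int32_t remainderLen;  // leftover returned to a deleted list (0 = none)
};

SplitDecision splitDeletedRecord(int32_t recLen, int32_t lenToAlloc, int32_t smallestBucket) {
    const int32_t remainder = recLen - lenToAlloc;
    if (remainder >= smallestBucket)
        return {lenToAlloc, remainder};  // split; remainder becomes a new DeletedRecord
    return {recLen, 0};                  // too small to be useful; take the whole record
}

int main() {
    assert(roundUp4(61) == 64);
    // A 200-byte deleted record serving a 64-byte request leaves a 136-byte
    // remainder, which (assuming a smallest bucket size of 32) is re-listed.
    const SplitDecision d = splitDeletedRecord(200, 64, 32);
    assert(d.allocLen == 64 && d.remainderLen == 136);
}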
- const int remainingLength = dr->lengthWithHeaders() - lenToAlloc; - if (remainingLength >= bucketSizes[0]) { - opCtx->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc; - const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc); - DeletedRecord* newDel = opCtx->recoveryUnit()->writing(drec(newDelLoc)); - newDel->extentOfs() = dr->extentOfs(); - newDel->lengthWithHeaders() = remainingLength; - newDel->nextDeleted().Null(); - - addDeletedRec(opCtx, newDelLoc); - } - - return loc; -} - -StatusWith<DiskLoc> SimpleRecordStoreV1::allocRecord(OperationContext* opCtx, - int lengthWithHeaders, - bool enforceQuota) { - if (lengthWithHeaders > MaxAllowedAllocation) { - return StatusWith<DiskLoc>( - ErrorCodes::InvalidLength, - str::stream() << "Attempting to allocate a record larger than maximum size: " - << lengthWithHeaders - << " > 16.5MB"); - } - - DiskLoc loc = _allocFromExistingExtents(opCtx, lengthWithHeaders); - if (!loc.isNull()) - return StatusWith<DiskLoc>(loc); - - LOG(1) << "allocating new extent"; - - increaseStorageSize( - opCtx, - _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(opCtx)), - enforceQuota); - - loc = _allocFromExistingExtents(opCtx, lengthWithHeaders); - if (!loc.isNull()) { - // got on first try - return StatusWith<DiskLoc>(loc); - } - - log() << "warning: alloc() failed after allocating new extent. " - << "lengthWithHeaders: " << lengthWithHeaders - << " last extent size:" << _details->lastExtentSize(opCtx) << "; trying again"; - - for (int z = 0; z < 10 && lengthWithHeaders > _details->lastExtentSize(opCtx); z++) { - log() << "try #" << z << endl; - - increaseStorageSize( - opCtx, - _extentManager->followupSize(lengthWithHeaders, _details->lastExtentSize(opCtx)), - enforceQuota); - - loc = _allocFromExistingExtents(opCtx, lengthWithHeaders); - if (!loc.isNull()) - return StatusWith<DiskLoc>(loc); - } - - return StatusWith<DiskLoc>(ErrorCodes::InternalError, "cannot allocate space"); -} - -Status SimpleRecordStoreV1::truncate(OperationContext* opCtx) { - const DiskLoc firstExtLoc = _details->firstExtent(opCtx); - if (firstExtLoc.isNull() || !firstExtLoc.isValid()) { - // Already empty - return Status::OK(); - } - - // Free all extents except the first. - Extent* firstExt = _extentManager->getExtent(firstExtLoc); - if (!firstExt->xnext.isNull()) { - const DiskLoc extNextLoc = firstExt->xnext; - const DiskLoc oldLastExtLoc = _details->lastExtent(opCtx); - Extent* const nextExt = _extentManager->getExtent(extNextLoc); - - // Unlink other extents; - *opCtx->recoveryUnit()->writing(&nextExt->xprev) = DiskLoc(); - *opCtx->recoveryUnit()->writing(&firstExt->xnext) = DiskLoc(); - _details->setLastExtent(opCtx, firstExtLoc); - _details->setLastExtentSize(opCtx, firstExt->length); - - _extentManager->freeExtents(opCtx, extNextLoc, oldLastExtLoc); - } - - // Make the first (now only) extent a single large deleted record. - *opCtx->recoveryUnit()->writing(&firstExt->firstRecord) = DiskLoc(); - *opCtx->recoveryUnit()->writing(&firstExt->lastRecord) = DiskLoc(); - _details->orphanDeletedList(opCtx); - addDeletedRec(opCtx, _findFirstSpot(opCtx, firstExtLoc, firstExt)); - - // Make stats reflect that there are now no documents in this record store. 
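// The allocRecord() logic above follows an allocate-or-grow pattern: try the
// freelist, grow the underlying storage when that fails, and retry a bounded
// number of times so a pathological freelist cannot loop forever. A compact
// sketch with hypothetical stand-in types (not the MongoDB interfaces):
#include <cstddef>
#include <optional>

namespace alloc_retry_sketch {

struct Store {
    std::size_t freeBytes = 0;     // bytes available on the freelist
    std::size_t lastExtent = 0;    // size of the most recently added extent

    // Stand-in for _allocFromExistingExtents().
    std::optional<std::size_t> tryAlloc(std::size_t len) {
        if (len > freeBytes)
            return std::nullopt;
        freeBytes -= len;
        return len;  // a real store would return a location, not a length
    }
    // Stand-in for increaseStorageSize().
    void grow(std::size_t atLeast) {
        lastExtent = atLeast;
        freeBytes += atLeast;
    }
};

inline std::optional<std::size_t> allocWithGrowth(Store& store, std::size_t len) {
    if (auto loc = store.tryAlloc(len))
        return loc;
    store.grow(len);
    if (auto loc = store.tryAlloc(len))
        return loc;  // got it on the first try after growing

    // Rarely a new extent may still be too small for the request; keep growing
    // with a bounded retry count, as the original code does (10 attempts).
    for (int attempt = 0; attempt < 10 && len > store.lastExtent; ++attempt) {
        store.grow(len);
        if (auto loc = store.tryAlloc(len))
            return loc;
    }
    return std::nullopt;  // caller reports "cannot allocate space"
}

}  // namespace alloc_retry_sketch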
-    _details->setStats(opCtx, 0, 0);
-
-    return Status::OK();
-}
-
-void SimpleRecordStoreV1::addDeletedRec(OperationContext* opCtx, const DiskLoc& dloc) {
-    DeletedRecord* d = drec(dloc);
-
-    int b = bucket(d->lengthWithHeaders());
-    *opCtx->recoveryUnit()->writing(&d->nextDeleted()) = _details->deletedListEntry(b);
-    _details->setDeletedListEntry(opCtx, b, dloc);
-}
-
-std::unique_ptr<SeekableRecordCursor> SimpleRecordStoreV1::getCursor(OperationContext* opCtx,
-                                                                     bool forward) const {
-    return stdx::make_unique<SimpleRecordStoreV1Iterator>(opCtx, this, forward);
-}
-
-vector<std::unique_ptr<RecordCursor>> SimpleRecordStoreV1::getManyCursors(
-    OperationContext* opCtx) const {
-    vector<std::unique_ptr<RecordCursor>> cursors;
-    const Extent* ext;
-    for (DiskLoc extLoc = details()->firstExtent(opCtx); !extLoc.isNull(); extLoc = ext->xnext) {
-        ext = _getExtent(opCtx, extLoc);
-        if (ext->firstRecord.isNull())
-            continue;
-        cursors.push_back(stdx::make_unique<RecordStoreV1Base::IntraExtentIterator>(
-            opCtx, ext->firstRecord, this));
-    }
-
-    return cursors;
-}
-
-class CompactDocWriter final : public DocWriter {
-public:
-    /**
-     * param allocationSize - allocation size WITH header
-     */
-    CompactDocWriter(const MmapV1RecordHeader* rec, unsigned dataSize, size_t allocationSize)
-        : _rec(rec), _dataSize(dataSize), _allocationSize(allocationSize) {}
-
-    virtual ~CompactDocWriter() {}
-
-    virtual void writeDocument(char* buf) const {
-        memcpy(buf, _rec->data(), _dataSize);
-    }
-
-    virtual size_t documentSize() const {
-        return _allocationSize - MmapV1RecordHeader::HeaderSize;
-    }
-
-    virtual bool addPadding() const {
-        return false;
-    }
-
-private:
-    const MmapV1RecordHeader* _rec;
-    size_t _dataSize;
-    size_t _allocationSize;
-};
-
-void SimpleRecordStoreV1::_compactExtent(OperationContext* opCtx,
-                                         const DiskLoc extentLoc,
-                                         int extentNumber,
-                                         RecordStoreCompactAdaptor* adaptor,
-                                         const CompactOptions* compactOptions,
-                                         CompactStats* stats) {
-    log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
-          << extentLoc;
-
-    unsigned oldObjSize = 0;  // we'll report what the old padding was
-    unsigned oldObjSizeWithPadding = 0;
-
-    Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
-    sourceExtent->assertOk();
-    fassert(17437, sourceExtent->validates(extentLoc));
-
-    {
-        // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we
-        // first page in the whole Extent sequentially.
-        // TODO benchmark on slow storage to verify this is measurably faster.
-        log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
-        Timer t;
-        size_t length = sourceExtent->length;
-
-        touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
-        int ms = t.millis();
-        if (ms > 1000)
-            log() << "compact end paging in " << ms << "ms "
-                  << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
-    }
-
-    {
-        // Move each MmapV1RecordHeader out of this extent and insert it into the "new" extents.
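// CompactDocWriter above follows the DocWriter pattern: the record store
// allocates space first and the caller writes the document directly into the
// provided buffer, avoiding an intermediate copy. A minimal sketch of the idea
// with hypothetical types; the real interface is MongoDB's storage API, and
// kHeaderSize here is an assumed constant:
#include <cstddef>
#include <cstring>
#include <vector>

namespace docwriter_sketch {

constexpr std::size_t kHeaderSize = 16;  // assumed record header size

class DocWriter {
public:
    virtual ~DocWriter() = default;
    virtual void writeDocument(char* buf) const = 0;  // buf has documentSize() bytes
    virtual std::size_t documentSize() const = 0;     // bytes needed, header excluded
};

class CopyWriter final : public DocWriter {
public:
    CopyWriter(const char* data, std::size_t dataSize, std::size_t allocationSize)
        : _data(data), _dataSize(dataSize), _allocationSize(allocationSize) {}

    void writeDocument(char* buf) const override {
        std::memcpy(buf, _data, _dataSize);  // trailing padding bytes stay as-is
    }
    // The store allocates header + documentSize(); since _allocationSize already
    // includes the header (as in CompactDocWriter), subtract it back out here.
    std::size_t documentSize() const override {
        return _allocationSize - kHeaderSize;
    }

private:
    const char* _data;
    std::size_t _dataSize;
    std::size_t _allocationSize;
};

// Toy record store: allocates header + documentSize() and lets the writer fill it.
inline std::vector<char> insertWithDocWriter(const DocWriter& writer) {
    std::vector<char> record(kHeaderSize + writer.documentSize());
    writer.writeDocument(record.data() + kHeaderSize);
    return record;
}

}  // namespace docwriter_sketch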
- log() << "compact copying records" << endl; - long long totalNetSize = 0; - long long nrecords = 0; - DiskLoc nextSourceLoc = sourceExtent->firstRecord; - while (!nextSourceLoc.isNull()) { - opCtx->checkForInterrupt(); - - WriteUnitOfWork wunit(opCtx); - MmapV1RecordHeader* recOld = recordFor(nextSourceLoc); - RecordData oldData = recOld->toRecordData(); - nextSourceLoc = getNextRecordInExtent(opCtx, nextSourceLoc); - - if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) { - // object is corrupt! - log() << "compact removing corrupt document!"; - stats->corruptDocuments++; - } else { - // How much data is in the record. Excludes padding and MmapV1RecordHeader headers. - const unsigned rawDataSize = adaptor->dataSize(oldData); - - nrecords++; - oldObjSize += rawDataSize; - oldObjSizeWithPadding += recOld->netLength(); - - // Allocation sizes include the headers and possibly some padding. - const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize; - unsigned allocationSize = minAllocationSize; - switch (compactOptions->paddingMode) { - case CompactOptions::NONE: // default padding - if (shouldPadInserts()) { - allocationSize = quantizeAllocationSpace(minAllocationSize); - } - break; - - case CompactOptions::PRESERVE: // keep original padding - allocationSize = recOld->lengthWithHeaders(); - break; - - case CompactOptions::MANUAL: // user specified how much padding to use - allocationSize = compactOptions->computeRecordSize(minAllocationSize); - if (allocationSize < minAllocationSize || - allocationSize > BSONObjMaxUserSize / 2) { - allocationSize = minAllocationSize; - } - break; - } - invariant(allocationSize >= minAllocationSize); - - // Copy the data to a new record. Because we orphaned the record freelist at the - // start of the compact, this insert will allocate a record in a new extent. - // See the comment in compact() for more details. - CompactDocWriter writer(recOld, rawDataSize, allocationSize); - StatusWith<RecordId> status = - insertRecordWithDocWriter(opCtx, &writer, Timestamp()); - uassertStatusOK(status.getStatus()); - const MmapV1RecordHeader* newRec = - recordFor(DiskLoc::fromRecordId(status.getValue())); - invariant(unsigned(newRec->netLength()) >= rawDataSize); - totalNetSize += newRec->netLength(); - - // Tells the caller that the record has been moved, so it can do things such as - // add it to indexes. - adaptor->inserted(newRec->toRecordData(), status.getValue()); - } - - // Remove the old record from the linked list of records withing the sourceExtent. - // The old record is not added to the freelist as we will be freeing the whole - // extent at the end. - *opCtx->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc; - if (nextSourceLoc.isNull()) { - // Just moved the last record out of the extent. Mark extent as empty. - *opCtx->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc(); - } else { - MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc); - opCtx->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs; - } - - // Adjust the stats to reflect the removal of the old record. The insert above - // handled adjusting the stats for the new record. - _details->incrementStats(opCtx, -(recOld->netLength()), -1); - - wunit.commit(); - } - - // The extent must now be empty. - invariant(sourceExtent->firstRecord.isNull()); - invariant(sourceExtent->lastRecord.isNull()); - - // We are still the first extent, but we must not be the only extent. 
- invariant(_details->firstExtent(opCtx) == extentLoc); - invariant(_details->lastExtent(opCtx) != extentLoc); - - // Remove the newly emptied sourceExtent from the extent linked list and return it to - // the extent manager. - WriteUnitOfWork wunit(opCtx); - const DiskLoc newFirst = sourceExtent->xnext; - _details->setFirstExtent(opCtx, newFirst); - *opCtx->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc(); - _extentManager->freeExtent(opCtx, extentLoc); - wunit.commit(); - - { - const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize - : 1.0; // defining 0/0 as 1 for this. - - log() << "compact finished extent #" << extentNumber << " containing " << nrecords - << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)" - << " oldPadding: " << oldPadding; - } - } -} - -Status SimpleRecordStoreV1::compact(OperationContext* opCtx, - RecordStoreCompactAdaptor* adaptor, - const CompactOptions* options, - CompactStats* stats) { - std::vector<DiskLoc> extents; - for (DiskLoc extLocation = _details->firstExtent(opCtx); !extLocation.isNull(); - extLocation = _extentManager->getExtent(extLocation)->xnext) { - extents.push_back(extLocation); - } - log() << "compact " << extents.size() << " extents"; - - { - WriteUnitOfWork wunit(opCtx); - // Orphaning the deleted lists ensures that all inserts go to new extents rather than - // the ones that existed before starting the compact. If we abort the operation before - // completion, any free space in the old extents will be leaked and never reused unless - // the collection is compacted again or dropped. This is considered an acceptable - // failure mode as no data will be lost. - log() << "compact orphan deleted lists" << endl; - _details->orphanDeletedList(opCtx); - - // Start over from scratch with our extent sizing and growth - _details->setLastExtentSize(opCtx, 0); - - // create a new extent so new records go there - const bool enforceQuota = false; - increaseStorageSize(opCtx, _details->lastExtentSize(opCtx), enforceQuota); - wunit.commit(); - } - - stdx::unique_lock<Client> lk(*opCtx->getClient()); - ProgressMeterHolder pm(CurOp::get(opCtx)->setMessage_inlock( - "compact extent", "Extent Compacting Progress", extents.size())); - lk.unlock(); - - // Go through all old extents and move each record to a new set of extents. - int extentNumber = 0; - for (std::vector<DiskLoc>::iterator it = extents.begin(); it != extents.end(); it++) { - opCtx->checkForInterrupt(); - invariant(_details->firstExtent(opCtx) == *it); - // empties and removes the first extent - _compactExtent(opCtx, *it, extentNumber++, adaptor, options, stats); - invariant(_details->firstExtent(opCtx) != *it); - pm.hit(); - } - - invariant(_extentManager->getExtent(_details->firstExtent(opCtx))->xprev.isNull()); - invariant(_extentManager->getExtent(_details->lastExtent(opCtx))->xnext.isNull()); - - // indexes will do their own progress meter - pm.finished(); - - return Status::OK(); -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h deleted file mode 100644 index 61c04bbf420..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple.h +++ /dev/null @@ -1,106 +0,0 @@ -// record_store_v1_simple.h - -/** -* Copyright (C) 2013-2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. 
-* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#pragma once - -#include "mongo/db/catalog/collection_options.h" -#include "mongo/db/storage/mmap_v1/diskloc.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" - -namespace mongo { - -class SimpleRecordStoreV1Cursor; - -// used by index and original collections -class SimpleRecordStoreV1 : public RecordStoreV1Base { -public: - SimpleRecordStoreV1(OperationContext* opCtx, - StringData ns, - RecordStoreV1MetaData* details, - ExtentManager* em, - bool isSystemIndexes); - - virtual ~SimpleRecordStoreV1(); - - const char* name() const { - return "SimpleRecordStoreV1"; - } - - std::unique_ptr<SeekableRecordCursor> getCursor(OperationContext* opCtx, - bool forward) const final; - - std::vector<std::unique_ptr<RecordCursor>> getManyCursors(OperationContext* opCtx) const final; - - virtual Status truncate(OperationContext* opCtx); - - virtual void cappedTruncateAfter(OperationContext* opCtx, RecordId end, bool inclusive) { - invariant(!"cappedTruncateAfter not supported"); - } - - virtual bool compactSupported() const { - return true; - } - virtual bool compactsInPlace() const { - return false; - } - virtual Status compact(OperationContext* opCtx, - RecordStoreCompactAdaptor* adaptor, - const CompactOptions* options, - CompactStats* stats); - -protected: - virtual bool isCapped() const { - return false; - } - virtual bool shouldPadInserts() const { - return !_details->isUserFlagSet(CollectionOptions::Flag_NoPadding); - } - - virtual StatusWith<DiskLoc> allocRecord(OperationContext* opCtx, - int lengthWithHeaders, - bool enforceQuota); - - virtual void addDeletedRec(OperationContext* opCtx, const DiskLoc& dloc); - -private: - DiskLoc _allocFromExistingExtents(OperationContext* opCtx, int lengthWithHeaders); - - void _compactExtent(OperationContext* opCtx, - const DiskLoc diskloc, - int extentNumber, - RecordStoreCompactAdaptor* adaptor, - const CompactOptions* compactOptions, - CompactStats* stats); - - bool _normalCollection; - - friend class SimpleRecordStoreV1Iterator; -}; -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp deleted file mode 100644 index 414e1016a6b..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/** - * Copyright (C) 2013 
10gen Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#include "mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h"
-
-#include "mongo/db/catalog/collection.h"
-#include "mongo/db/storage/mmap_v1/extent.h"
-#include "mongo/db/storage/mmap_v1/extent_manager.h"
-#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h"
-
-namespace mongo {
-
-//
-// Regular / non-capped collection traversal
-//
-
-SimpleRecordStoreV1Iterator::SimpleRecordStoreV1Iterator(OperationContext* opCtx,
-                                                         const SimpleRecordStoreV1* collection,
-                                                         bool forward)
-    : _opCtx(opCtx), _recordStore(collection), _forward(forward) {
-    // Eagerly seek to first Record on creation since it is cheap.
-    const ExtentManager* em = _recordStore->_extentManager;
-    if (_recordStore->details()->firstExtent(opCtx).isNull()) {
-        // nothing in the collection
-        verify(_recordStore->details()->lastExtent(opCtx).isNull());
-    } else if (_forward) {
-        // Find a non-empty extent and start with the first record in it.
-        Extent* e = em->getExtent(_recordStore->details()->firstExtent(opCtx));
-
-        while (e->firstRecord.isNull() && !e->xnext.isNull()) {
-            e = em->getExtent(e->xnext);
-        }
-
-        // _curr may be set to DiskLoc() here if e->firstRecord is Null but there is no
-        // valid e->xnext
-        _curr = e->firstRecord;
-    } else {
-        // Walk backwards, skipping empty extents, and use the last record in the first
-        // non-empty extent we see.
-        Extent* e = em->getExtent(_recordStore->details()->lastExtent(opCtx));
-
-        // TODO ELABORATE
-        // Does one of e->lastRecord.isNull(), e->firstRecord.isNull() imply the other?
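// The constructor logic above boils down to walking a doubly linked list of
// extents (xnext/xprev) and skipping the empty ones. The same traversal in
// isolation, with hypothetical node types:
namespace extent_walk_sketch {

struct Extent {
    bool empty = true;        // stand-in for firstRecord.isNull()
    Extent* xnext = nullptr;  // next extent, or nullptr
    Extent* xprev = nullptr;  // previous extent, or nullptr
};

// Returns the first non-empty extent at or after 'e', or the last extent
// visited when every remaining extent is empty (mirroring how _curr can end
// up null when the scan runs off the end of the list).
inline Extent* firstNonEmptyForward(Extent* e) {
    while (e->empty && e->xnext != nullptr)
        e = e->xnext;
    return e;
}

inline Extent* firstNonEmptyBackward(Extent* e) {
    while (e->empty && e->xprev != nullptr)
        e = e->xprev;
    return e;
}

}  // namespace extent_walk_sketch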
- while (e->lastRecord.isNull() && !e->xprev.isNull()) { - e = em->getExtent(e->xprev); - } - - // _curr may be set to DiskLoc() here if e->lastRecord isNull but there is no - // valid e->xprev - _curr = e->lastRecord; - } -} - -boost::optional<Record> SimpleRecordStoreV1Iterator::next() { - if (isEOF()) - return {}; - auto toReturn = _curr.toRecordId(); - advance(); - return {{toReturn, _recordStore->RecordStore::dataFor(_opCtx, toReturn)}}; -} - -boost::optional<Record> SimpleRecordStoreV1Iterator::seekExact(const RecordId& id) { - _curr = DiskLoc::fromRecordId(id); - advance(); - return {{id, _recordStore->RecordStore::dataFor(_opCtx, id)}}; -} - -void SimpleRecordStoreV1Iterator::advance() { - // Move to the next thing. - if (!isEOF()) { - if (_forward) { - _curr = _recordStore->getNextRecord(_opCtx, _curr); - } else { - _curr = _recordStore->getPrevRecord(_opCtx, _curr); - } - } -} - -void SimpleRecordStoreV1Iterator::invalidate(OperationContext* opCtx, const RecordId& dl) { - // Just move past the thing being deleted. - if (dl == _curr.toRecordId()) { - const DiskLoc origLoc = _curr; - - // Undo the advance on rollback, as the deletion that forced it "never happened". - opCtx->recoveryUnit()->onRollback([this, origLoc]() { this->_curr = origLoc; }); - advance(); - } -} - -void SimpleRecordStoreV1Iterator::save() {} - -bool SimpleRecordStoreV1Iterator::restore() { - // if the collection is dropped, then the cursor should be destroyed - return true; -} - -std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForNext() const { - return _recordStore->_extentManager->recordNeedsFetch(_curr); -} - -std::unique_ptr<RecordFetcher> SimpleRecordStoreV1Iterator::fetcherForId(const RecordId& id) const { - return _recordStore->_extentManager->recordNeedsFetch(DiskLoc::fromRecordId(id)); -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h deleted file mode 100644 index dd54877ee93..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_iterator.h +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (C) 2013 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. 
If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */
-
-#pragma once
-
-#include "mongo/db/storage/mmap_v1/diskloc.h"
-#include "mongo/db/storage/record_store.h"
-
-namespace mongo {
-
-class SimpleRecordStoreV1;
-
-/**
- * This class iterates over a non-capped collection. The collection must exist
- * when the constructor is called.
- *
- * The cursor is positioned at the first record (or the last record, for
- * backward scans) on construction.
- */
-class SimpleRecordStoreV1Iterator final : public SeekableRecordCursor {
-public:
-    SimpleRecordStoreV1Iterator(OperationContext* opCtx,
-                                const SimpleRecordStoreV1* records,
-                                bool forward);
-
-    boost::optional<Record> next() final;
-    boost::optional<Record> seekExact(const RecordId& id) final;
-    void save() final;
-    bool restore() final;
-    void detachFromOperationContext() final {
-        _opCtx = nullptr;
-    }
-    void reattachToOperationContext(OperationContext* opCtx) final {
-        _opCtx = opCtx;
-    }
-    void invalidate(OperationContext* opCtx, const RecordId& dl) final;
-    std::unique_ptr<RecordFetcher> fetcherForNext() const final;
-    std::unique_ptr<RecordFetcher> fetcherForId(const RecordId& id) const final;
-
-private:
-    void advance();
-    bool isEOF() {
-        return _curr.isNull();
-    }
-
-    // Used by next(); not owned by us.
-    OperationContext* _opCtx;
-
-    // The result returned on the next call to next().
-    DiskLoc _curr;
-    const SimpleRecordStoreV1* const _recordStore;
-    const bool _forward;
-};
-
-}  // namespace mongo
diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
deleted file mode 100644
index d1b3cb5c234..00000000000
--- a/src/mongo/db/storage/mmap_v1/record_store_v1_simple_test.cpp
+++ /dev/null
@@ -1,468 +0,0 @@
-// record_store_v1_simple_test.cpp
-
-/**
- * Copyright (C) 2014 MongoDB Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License, version 3,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * As a special exception, the copyright holders give permission to link the
- * code of portions of this program with the OpenSSL library under certain
- * conditions as described in each individual source file and distribute
- * linked combinations including the program with the OpenSSL library. You
- * must comply with the GNU Affero General Public License in all respects for
- * all of the code used other than as permitted herein. If you modify file(s)
- * with this exception, you may extend this exception to your version of the
- * file(s), but you are not obligated to do so. If you do not wish to do so,
- * delete this exception statement from your version. If you delete this
- * exception statement from all source files in the program, then also delete
- * it in the license file.
- */ - -#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" - -#include "mongo/db/operation_context_noop.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" -#include "mongo/unittest/unittest.h" - -using namespace mongo; - -namespace { - -using std::string; - -TEST(SimpleRecordStoreV1, quantizeAllocationSpaceSimple) { - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(33), 64); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000), 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10001), 16 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(100000), 128 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1000001), 1024 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(10000000), 10 * 1024 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024 - 1), - 14 * 1024 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024), 14 * 1024 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(14 * 1024 * 1024 + 1), - 16 * 1024 * 1024 + 512 * 1024); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(16 * 1024 * 1024 + 512 * 1024), - 16 * 1024 * 1024 + 512 * 1024); -} - -TEST(SimpleRecordStoreV1, quantizeAllocationMinMaxBound) { - const int maxSize = RecordStoreV1Base::MaxAllowedAllocation; - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(1), 32); - ASSERT_EQUALS(RecordStoreV1Base::quantizeAllocationSpace(maxSize), maxSize); -} - -/** - * Tests quantization of sizes around all valid bucket sizes. - */ -TEST(SimpleRecordStoreV1, quantizeAroundBucketSizes) { - for (int bucket = 0; bucket < RecordStoreV1Base::Buckets - 2; bucket++) { - const int size = RecordStoreV1Base::bucketSizes[bucket]; - const int nextSize = RecordStoreV1Base::bucketSizes[bucket + 1]; - - // size - 1 is quantized to size. - ASSERT_EQUALS(size, RecordStoreV1Base::quantizeAllocationSpace(size - 1)); - - // size is quantized to size. - ASSERT_EQUALS(size, RecordStoreV1Base::quantizeAllocationSpace(size)); - - // size + 1 is quantized to nextSize (if it is a valid allocation) - if (size + 1 <= RecordStoreV1Base::MaxAllowedAllocation) { - ASSERT_EQUALS(nextSize, RecordStoreV1Base::quantizeAllocationSpace(size + 1)); - } - } -} - -BSONObj docForRecordSize(int size) { - BSONObjBuilder b; - b.append("_id", 5); - b.append("x", string(size - MmapV1RecordHeader::HeaderSize - 22, 'x')); - BSONObj x = b.obj(); - ASSERT_EQUALS(MmapV1RecordHeader::HeaderSize + x.objsize(), size); - return x; -} - -class BsonDocWriter final : public DocWriter { -public: - BsonDocWriter(const BSONObj& obj, bool padding) : _obj(obj), _padding(padding) {} - - virtual void writeDocument(char* buf) const { - memcpy(buf, _obj.objdata(), _obj.objsize()); - } - virtual size_t documentSize() const { - return _obj.objsize(); - } - virtual bool addPadding() const { - return _padding; - } - -private: - BSONObj _obj; - bool _padding; -}; - -/** alloc() quantizes the requested size using quantizeAllocationSpace() rules. 
*/
-TEST(SimpleRecordStoreV1, AllocQuantized) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-
-    string myns = "test.AllocQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns, md, &em, false);
-
-    BSONObj obj = docForRecordSize(300);
-    StatusWith<RecordId> result =
-        rs.insertRecord(&opCtx, obj.objdata(), obj.objsize(), Timestamp(), false);
-    ASSERT(result.isOK());
-
-    // The length of the allocated record is quantized.
-    ASSERT_EQUALS(512,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-
-TEST(SimpleRecordStoreV1, AllocNonQuantized) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-    md->setUserFlag(&opCtx, CollectionOptions::Flag_NoPadding);
-
-    string myns = "test.AllocQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns, md, &em, false);
-
-    BSONObj obj = docForRecordSize(300);
-    StatusWith<RecordId> result =
-        rs.insertRecord(&opCtx, obj.objdata(), obj.objsize(), Timestamp(), false);
-    ASSERT(result.isOK());
-
-    // The length of the allocated record is not quantized.
-    ASSERT_EQUALS(300,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-
-TEST(SimpleRecordStoreV1, AllocNonQuantizedStillAligned) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-    md->setUserFlag(&opCtx, CollectionOptions::Flag_NoPadding);
-
-    string myns = "test.AllocQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns, md, &em, false);
-
-    BSONObj obj = docForRecordSize(298);
-    StatusWith<RecordId> result =
-        rs.insertRecord(&opCtx, obj.objdata(), obj.objsize(), Timestamp(), false);
-    ASSERT(result.isOK());
-
-    // The length of the allocated record is not quantized, but is aligned up to a
-    // multiple of 4 bytes.
-    ASSERT_EQUALS(300,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-
-/** alloc() quantizes the requested size if DocWriter::addPadding() returns true. */
-TEST(SimpleRecordStoreV1, AllocQuantizedWithDocWriter) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-
-    string myns = "test.AllocQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns, md, &em, false);
-
-    BsonDocWriter docWriter(docForRecordSize(300), true);
-    StatusWith<RecordId> result = rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT(result.isOK());
-
-    // The length of the allocated record is quantized.
-    ASSERT_EQUALS(512,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-
-/**
- * alloc() does not quantize records if DocWriter::addPadding() returns false.
- */
-TEST(SimpleRecordStoreV1, AllocNonQuantizedDocWriter) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-
-    string myns = "test.AllocIndexNamespaceNotQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns + "$x", md, &em, false);
-
-    BsonDocWriter docWriter(docForRecordSize(300), false);
-    StatusWith<RecordId> result = rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT(result.isOK());
-
-    // The length of the allocated record is not quantized.
-    ASSERT_EQUALS(300,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-
-/** alloc() aligns record sizes up to a multiple of 4 bytes even if DocWriter::addPadding() returns false.
*/
-TEST(SimpleRecordStoreV1, AllocAlignedDocWriter) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-
-    string myns = "test.AllocIndexNamespaceNotQuantized";
-    SimpleRecordStoreV1 rs(&opCtx, myns + "$x", md, &em, false);
-
-    BsonDocWriter docWriter(docForRecordSize(298), false);
-    StatusWith<RecordId> result = rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT(result.isOK());
-
-    ASSERT_EQUALS(300,
-                  rs.dataFor(&opCtx, result.getValue()).size() + MmapV1RecordHeader::HeaderSize);
-}
-/**
- * alloc() with a quantized size does not split the deleted record when not enough room is
- * left over.
- */
-TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithoutSplit) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-    SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false);
-
-    {
-        LocAndSize drecs[] = {{DiskLoc(0, 1000), 512 + 31}, {}};
-        initializeV1RS(&opCtx, NULL, drecs, NULL, &em, md);
-    }
-
-    BsonDocWriter docWriter(docForRecordSize(300), true);
-    StatusWith<RecordId> actualLocation =
-        rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT_OK(actualLocation.getStatus());
-
-    {
-        LocAndSize recs[] = {{DiskLoc(0, 1000), 512 + 31}, {}};
-        LocAndSize drecs[] = {{}};
-        assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md);
-    }
-}
-
-/**
- * alloc() with a quantized size splits the deleted record when enough room is left over.
- */
-TEST(SimpleRecordStoreV1, AllocUseQuantizedDeletedRecordWithSplit) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-    SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false);
-
-    {
-        LocAndSize drecs[] = {{DiskLoc(0, 1000), 512 + 32}, {}};
-        initializeV1RS(&opCtx, NULL, drecs, NULL, &em, md);
-    }
-
-    BsonDocWriter docWriter(docForRecordSize(300), true);
-    StatusWith<RecordId> actualLocation =
-        rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT_OK(actualLocation.getStatus());
-
-    {
-        LocAndSize recs[] = {{DiskLoc(0, 1000), 512}, {}};
-        LocAndSize drecs[] = {{DiskLoc(0, 1512), 32}, {}};
-        assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md);
-    }
-}
-
-/**
- * alloc() with a non-quantized size does not split the deleted record when not enough room is
- * left over.
- */
-TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithoutSplit) {
-    OperationContextNoop opCtx;
-    DummyExtentManager em;
-    DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0);
-    SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false);
-
-    {
-        LocAndSize drecs[] = {{DiskLoc(0, 1000), 331}, {}};
-        initializeV1RS(&opCtx, NULL, drecs, NULL, &em, md);
-    }
-
-    BsonDocWriter docWriter(docForRecordSize(300), false);
-    StatusWith<RecordId> actualLocation =
-        rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp());
-    ASSERT_OK(actualLocation.getStatus());
-
-    {
-        LocAndSize recs[] = {{DiskLoc(0, 1000), 331}, {}};
-        LocAndSize drecs[] = {{}};
-        assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md);
-    }
-}
-
-/**
- * alloc() with a non-quantized size splits the deleted record when enough room is left over.
- */ -TEST(SimpleRecordStoreV1, AllocUseNonQuantizedDeletedRecordWithSplit) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - { - LocAndSize drecs[] = {{DiskLoc(0, 1000), 332}, {}}; - initializeV1RS(&opCtx, NULL, drecs, NULL, &em, md); - } - - BsonDocWriter docWriter(docForRecordSize(300), false); - StatusWith<RecordId> actualLocation = - rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp()); - ASSERT_OK(actualLocation.getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 300}, {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1300), 32}, {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - } -} - -/** - * alloc() will use from the legacy grab bag if it can. - */ -TEST(SimpleRecordStoreV1, GrabBagIsUsed) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - { - LocAndSize drecs[] = {{}}; - LocAndSize grabBag[] = { - {DiskLoc(0, 1000), 4 * 1024 * 1024}, {DiskLoc(1, 1000), 4 * 1024 * 1024}, {}}; - initializeV1RS(&opCtx, NULL, drecs, grabBag, &em, md); - } - - BsonDocWriter docWriter(docForRecordSize(256), false); - StatusWith<RecordId> actualLocation = - rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp()); - ASSERT_OK(actualLocation.getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 256}, {}}; - LocAndSize drecs[] = {{DiskLoc(0, 1256), 4 * 1024 * 1024 - 256}, {}}; - LocAndSize grabBag[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}}; - assertStateV1RS(&opCtx, recs, drecs, grabBag, &em, md); - } -} - -/** - * alloc() will pull from the legacy grab bag even if it isn't needed. 
- */ -TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnneeded) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - { - LocAndSize drecs[] = {{DiskLoc(0, 1000), 1000}, {}}; - LocAndSize grabBag[] = { - {DiskLoc(1, 1000), 4 * 1024 * 1024}, {DiskLoc(2, 1000), 4 * 1024 * 1024}, {}}; - initializeV1RS(&opCtx, NULL, drecs, grabBag, &em, md); - } - - BsonDocWriter docWriter(docForRecordSize(1000), false); - StatusWith<RecordId> actualLocation = - rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp()); - ASSERT_OK(actualLocation.getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 1000}, {}}; - LocAndSize drecs[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}}; - LocAndSize grabBag[] = {{DiskLoc(2, 1000), 4 * 1024 * 1024}, {}}; - assertStateV1RS(&opCtx, recs, drecs, grabBag, &em, md); - } -} - -/** - * alloc() will pull from the legacy grab bag even if it can't be used - */ -TEST(SimpleRecordStoreV1, GrabBagIsPoppedEvenIfUnusable) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - { - LocAndSize drecs[] = {{DiskLoc(0, 1000), 8 * 1024 * 1024}, {}}; - LocAndSize grabBag[] = { - {DiskLoc(1, 1000), 4 * 1024 * 1024}, {DiskLoc(2, 1000), 4 * 1024 * 1024}, {}}; - initializeV1RS(&opCtx, NULL, drecs, grabBag, &em, md); - } - - BsonDocWriter docWriter(docForRecordSize(8 * 1024 * 1024), false); - StatusWith<RecordId> actualLocation = - rs.insertRecordWithDocWriter(&opCtx, &docWriter, Timestamp()); - ASSERT_OK(actualLocation.getStatus()); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 8 * 1024 * 1024}, {}}; - LocAndSize drecs[] = {{DiskLoc(1, 1000), 4 * 1024 * 1024}, {}}; - LocAndSize grabBag[] = {{DiskLoc(2, 1000), 4 * 1024 * 1024}, {}}; - assertStateV1RS(&opCtx, recs, drecs, grabBag, &em, md); - } -} - -// ----------------- - -TEST(SimpleRecordStoreV1, FullSimple1) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - - ASSERT_EQUALS(0, md->numRecords()); - StatusWith<RecordId> result = rs.insertRecord(&opCtx, "abc", 4, Timestamp(), true); - ASSERT_TRUE(result.isOK()); - ASSERT_EQUALS(1, md->numRecords()); - RecordData recordData = rs.dataFor(&opCtx, result.getValue()); - ASSERT_EQUALS(string("abc"), string(recordData.data())); -} - -// ----------------- - -TEST(SimpleRecordStoreV1, Truncate) { - OperationContextNoop opCtx; - DummyExtentManager em; - DummyRecordStoreV1MetaData* md = new DummyRecordStoreV1MetaData(false, 0); - SimpleRecordStoreV1 rs(&opCtx, "test.foo", md, &em, false); - - { - LocAndSize recs[] = {{DiskLoc(0, 1000), 100}, - {DiskLoc(0, 1100), 100}, - {DiskLoc(0, 1300), 100}, - {DiskLoc(2, 1100), 100}, - {}}; - LocAndSize drecs[] = { - {DiskLoc(0, 1200), 100}, {DiskLoc(2, 1000), 100}, {DiskLoc(1, 1000), 1000}, {}}; - - initializeV1RS(&opCtx, recs, drecs, NULL, &em, md); - - ASSERT_EQUALS(em.getExtent(DiskLoc(0, 0))->length, em.minSize()); - } - - rs.truncate(&opCtx).transitional_ignore(); - - { - LocAndSize recs[] = {{}}; - LocAndSize drecs[] = { - // One extent filled with a single deleted record. 
- {DiskLoc(0, Extent::HeaderSize()), em.minSize() - Extent::HeaderSize()}, - {}}; - assertStateV1RS(&opCtx, recs, drecs, NULL, &em, md); - } -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp deleted file mode 100644 index 8c55c72301b..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.cpp +++ /dev/null @@ -1,668 +0,0 @@ -// record_store_v1_test_help.cpp - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/db/storage/mmap_v1/record_store_v1_test_help.h" - -#include <algorithm> -#include <boost/next_prior.hpp> -#include <map> -#include <set> -#include <vector> - -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/record.h" -#include "mongo/db/storage/record_fetcher.h" -#include "mongo/unittest/unittest.h" -#include "mongo/util/allocator.h" -#include "mongo/util/log.h" - -namespace mongo { - -using std::numeric_limits; - -DummyRecordStoreV1MetaData::DummyRecordStoreV1MetaData(bool capped, int userFlags) { - _dataSize = 0; - _numRecords = 0; - _capped = capped; - _userFlags = userFlags; - _lastExtentSize = 0; - _paddingFactor = 1; - _maxCappedDocs = numeric_limits<long long>::max(); - _capFirstNewRecord.setInvalid(); - if (_capped) { - // copied from NamespaceDetails::NamespaceDetails() - setDeletedListEntry(NULL, 1, DiskLoc().setInvalid()); - } -} - -const DiskLoc& DummyRecordStoreV1MetaData::capExtent() const { - return _capExtent; -} - -void DummyRecordStoreV1MetaData::setCapExtent(OperationContext* opCtx, const DiskLoc& loc) { - _capExtent = loc; -} - -const DiskLoc& DummyRecordStoreV1MetaData::capFirstNewRecord() const { - return _capFirstNewRecord; -} - -void DummyRecordStoreV1MetaData::setCapFirstNewRecord(OperationContext* opCtx, const DiskLoc& loc) { - _capFirstNewRecord = loc; -} - -long long DummyRecordStoreV1MetaData::dataSize() const { - return _dataSize; -} - -long long DummyRecordStoreV1MetaData::numRecords() const { - return _numRecords; -} - -void DummyRecordStoreV1MetaData::incrementStats(OperationContext* opCtx, - long long dataSizeIncrement, - long long numRecordsIncrement) { - _dataSize += dataSizeIncrement; - _numRecords += numRecordsIncrement; -} - -void DummyRecordStoreV1MetaData::setStats(OperationContext* opCtx, - long long dataSize, - long long numRecords) { - _dataSize = dataSize; - _numRecords = numRecords; -} - -namespace { -DiskLoc myNull; -} - -DiskLoc DummyRecordStoreV1MetaData::deletedListEntry(int bucket) const { - invariant(bucket >= 0); - if (static_cast<size_t>(bucket) >= _deletedLists.size()) - return myNull; - return _deletedLists[bucket]; -} - -void DummyRecordStoreV1MetaData::setDeletedListEntry(OperationContext* opCtx, - int bucket, - const DiskLoc& loc) { - invariant(bucket >= 0); - invariant(bucket < 1000); - while (static_cast<size_t>(bucket) >= _deletedLists.size()) - _deletedLists.push_back(DiskLoc()); - _deletedLists[bucket] = loc; -} - -DiskLoc DummyRecordStoreV1MetaData::deletedListLegacyGrabBag() const { - return _deletedListLegacyGrabBag; -} - -void DummyRecordStoreV1MetaData::setDeletedListLegacyGrabBag(OperationContext* opCtx, - const DiskLoc& loc) { - _deletedListLegacyGrabBag = loc; -} - -void DummyRecordStoreV1MetaData::orphanDeletedList(OperationContext* opCtx) { - // They will be recreated on demand. 
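// DummyRecordStoreV1MetaData above keeps the per-bucket deleted lists in a
// vector that grows on demand, reads back a shared null default for missing
// slots, and is simply cleared by orphanDeletedList(). That pattern in
// isolation, with a hypothetical generic container:
#include <cstddef>
#include <vector>

namespace default_vector_sketch {

template <typename T>
class DefaultedVector {
public:
    // Read: out-of-range slots return a default without growing the vector.
    const T& get(std::size_t i) const {
        static const T kDefault{};
        return i < _v.size() ? _v[i] : kDefault;
    }
    // Write: grow with default-constructed slots, then assign.
    void set(std::size_t i, const T& value) {
        if (i >= _v.size())
            _v.resize(i + 1);
        _v[i] = value;
    }
    // Like orphanDeletedList(): drop everything; entries are recreated on demand.
    void clear() {
        _v.clear();
    }

private:
    std::vector<T> _v;
};

}  // namespace default_vector_sketch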
- _deletedLists.clear(); -} - -const DiskLoc& DummyRecordStoreV1MetaData::firstExtent(OperationContext* opCtx) const { - return _firstExtent; -} - -void DummyRecordStoreV1MetaData::setFirstExtent(OperationContext* opCtx, const DiskLoc& loc) { - _firstExtent = loc; -} - -const DiskLoc& DummyRecordStoreV1MetaData::lastExtent(OperationContext* opCtx) const { - return _lastExtent; -} - -void DummyRecordStoreV1MetaData::setLastExtent(OperationContext* opCtx, const DiskLoc& loc) { - _lastExtent = loc; -} - -bool DummyRecordStoreV1MetaData::isCapped() const { - return _capped; -} - -bool DummyRecordStoreV1MetaData::isUserFlagSet(int flag) const { - return _userFlags & flag; -} - -bool DummyRecordStoreV1MetaData::setUserFlag(OperationContext* opCtx, int flag) { - if ((_userFlags & flag) == flag) - return false; - - _userFlags |= flag; - return true; -} -bool DummyRecordStoreV1MetaData::clearUserFlag(OperationContext* opCtx, int flag) { - if ((_userFlags & flag) == 0) - return false; - - _userFlags &= ~flag; - return true; -} -bool DummyRecordStoreV1MetaData::replaceUserFlags(OperationContext* opCtx, int flags) { - if (_userFlags == flags) - return false; - _userFlags = flags; - return true; -} - - -int DummyRecordStoreV1MetaData::lastExtentSize(OperationContext* opCtx) const { - return _lastExtentSize; -} - -void DummyRecordStoreV1MetaData::setLastExtentSize(OperationContext* opCtx, int newMax) { - _lastExtentSize = newMax; -} - -long long DummyRecordStoreV1MetaData::maxCappedDocs() const { - return _maxCappedDocs; -} - -// ----------------------------------------- - -DummyExtentManager::~DummyExtentManager() { - for (size_t i = 0; i < _extents.size(); i++) { - if (_extents[i].data) - free(_extents[i].data); - } -} - -void DummyExtentManager::close(OperationContext* opCtx) {} - -Status DummyExtentManager::init(OperationContext* opCtx) { - return Status::OK(); -} - -int DummyExtentManager::numFiles() const { - return static_cast<int>(_extents.size()); -} - -long long DummyExtentManager::fileSize() const { - MONGO_UNREACHABLE; -} - -DiskLoc DummyExtentManager::allocateExtent(OperationContext* opCtx, - bool capped, - int size, - bool enforceQuota) { - size = quantizeExtentSize(size); - - ExtentInfo info; - info.data = static_cast<char*>(mongoMalloc(size)); - info.length = size; - - DiskLoc loc(_extents.size(), 0); - _extents.push_back(info); - - Extent* e = getExtent(loc, false); - e->magic = Extent::extentSignature; - e->myLoc = loc; - e->xnext.Null(); - e->xprev.Null(); - e->length = size; - e->firstRecord.Null(); - e->lastRecord.Null(); - - return loc; -} - -void DummyExtentManager::freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt) { - // XXX -} - -void DummyExtentManager::freeExtent(OperationContext* opCtx, DiskLoc extent) { - // XXX -} -void DummyExtentManager::freeListStats(OperationContext* opCtx, - int* numExtents, - int64_t* totalFreeSizeBytes) const { - MONGO_UNREACHABLE; -} - -std::unique_ptr<RecordFetcher> DummyExtentManager::recordNeedsFetch(const DiskLoc& loc) const { - return {}; -} - -MmapV1RecordHeader* DummyExtentManager::recordForV1(const DiskLoc& loc) const { - if (static_cast<size_t>(loc.a()) >= _extents.size()) - return NULL; - if (static_cast<size_t>(loc.getOfs()) >= _extents[loc.a()].length) - return NULL; - char* root = _extents[loc.a()].data; - return reinterpret_cast<MmapV1RecordHeader*>(root + loc.getOfs()); -} - -Extent* DummyExtentManager::extentForV1(const DiskLoc& loc) const { - MONGO_UNREACHABLE; -} - -DiskLoc 
DummyExtentManager::extentLocForV1(const DiskLoc& loc) const { - return DiskLoc(loc.a(), 0); -} - -Extent* DummyExtentManager::getExtent(const DiskLoc& loc, bool doSanityCheck) const { - invariant(!loc.isNull()); - invariant(static_cast<size_t>(loc.a()) < _extents.size()); - invariant(loc.getOfs() == 0); - Extent* ext = reinterpret_cast<Extent*>(_extents[loc.a()].data); - if (doSanityCheck) - ext->assertOk(); - return ext; -} - -int DummyExtentManager::maxSize() const { - return 1024 * 1024 * 64; -} - -DummyExtentManager::CacheHint* DummyExtentManager::cacheHint(const DiskLoc& extentLoc, - const HintType& hint) { - return new CacheHint(); -} - -DataFileVersion DummyExtentManager::getFileFormat(OperationContext* opCtx) const { - return DataFileVersion::defaultForNewFiles(); -} - -void DummyExtentManager::setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) {} - -const DataFile* DummyExtentManager::getOpenFile(int n) const { - return nullptr; -} - -namespace { -void accumulateExtentSizeRequirements(const LocAndSize* las, std::map<int, size_t>* sizes) { - if (!las) - return; - - while (!las->loc.isNull()) { - // We require passed in offsets to be > 1000 to leave room for Extent headers. - invariant(Extent::HeaderSize() < 1000); - invariant(las->loc.getOfs() >= 1000); - - const size_t end = las->loc.getOfs() + las->size; - size_t& sizeNeeded = (*sizes)[las->loc.a()]; - sizeNeeded = std::max(sizeNeeded, end); - las++; - } -} - -void printRecList(OperationContext* opCtx, - const ExtentManager* em, - const RecordStoreV1MetaData* md) { - log() << " *** BEGIN ACTUAL RECORD LIST *** "; - DiskLoc extLoc = md->firstExtent(opCtx); - std::set<DiskLoc> seenLocs; - while (!extLoc.isNull()) { - Extent* ext = em->getExtent(extLoc, true); - DiskLoc actualLoc = ext->firstRecord; - while (!actualLoc.isNull()) { - const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc); - const int actualSize = actualRec->lengthWithHeaders(); - - log() << "loc: " << actualLoc // <--hex - << " (" << actualLoc.getOfs() << ")" - << " size: " << actualSize << " prev: " << actualRec->prevOfs() - << " next: " << actualRec->nextOfs() - << (actualLoc == md->capFirstNewRecord() ? " (CAP_FIRST_NEW)" : ""); - - const bool foundCycle = !seenLocs.insert(actualLoc).second; - invariant(!foundCycle); - - const int nextOfs = actualRec->nextOfs(); - actualLoc = (nextOfs == DiskLoc::NullOfs ? 
DiskLoc() : DiskLoc(actualLoc.a(), nextOfs)); - } - extLoc = ext->xnext; - } - log() << " *** END ACTUAL RECORD LIST *** "; -} - -void printDRecList(const ExtentManager* em, const RecordStoreV1MetaData* md) { - log() << " *** BEGIN ACTUAL DELETED RECORD LIST *** "; - std::set<DiskLoc> seenLocs; - for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { - DiskLoc actualLoc = md->deletedListEntry(bucketIdx); - while (!actualLoc.isNull()) { - const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); - const int actualSize = actualDrec->lengthWithHeaders(); - - log() << "loc: " << actualLoc // <--hex - << " (" << actualLoc.getOfs() << ")" - << " size: " << actualSize << " bucket: " << bucketIdx - << " next: " << actualDrec->nextDeleted(); - - const bool foundCycle = !seenLocs.insert(actualLoc).second; - invariant(!foundCycle); - - actualLoc = actualDrec->nextDeleted(); - } - - // Only print bucket 0 in capped collections since it contains all deleted records - if (md->isCapped()) - break; - } - log() << " *** END ACTUAL DELETED RECORD LIST *** "; -} -} - -void initializeV1RS(OperationContext* opCtx, - const LocAndSize* records, - const LocAndSize* drecs, - const LocAndSize* legacyGrabBag, - DummyExtentManager* em, - DummyRecordStoreV1MetaData* md) { - invariant(records || drecs); // if both are NULL nothing is being created... - - // Need to start with a blank slate - invariant(em->numFiles() == 0); - invariant(md->firstExtent(opCtx).isNull()); - - // pre-allocate extents (even extents that aren't part of this RS) - { - typedef std::map<int, size_t> ExtentSizes; - ExtentSizes extentSizes; - accumulateExtentSizeRequirements(records, &extentSizes); - accumulateExtentSizeRequirements(drecs, &extentSizes); - accumulateExtentSizeRequirements(legacyGrabBag, &extentSizes); - invariant(!extentSizes.empty()); - - const int maxExtent = extentSizes.rbegin()->first; - for (int i = 0; i <= maxExtent; i++) { - const size_t size = extentSizes.count(i) ? extentSizes[i] : 0; - const DiskLoc loc = em->allocateExtent(opCtx, md->isCapped(), size, 0); - - // This function and assertState depend on these details of DummyExtentManager - invariant(loc.a() == i); - invariant(loc.getOfs() == 0); - } - - // link together extents that should be part of this RS - md->setFirstExtent(opCtx, DiskLoc(extentSizes.begin()->first, 0)); - md->setLastExtent(opCtx, DiskLoc(extentSizes.rbegin()->first, 0)); - for (ExtentSizes::iterator it = extentSizes.begin(); boost::next(it) != extentSizes.end(); - /* ++it */) { - const int a = it->first; - ++it; - const int b = it->first; - em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0); - em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0); - } - - // This signals "done allocating new extents". 
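// initializeV1RS() above links the pre-allocated extents into a doubly linked
// list by walking adjacent pairs of the sorted extent ids, exactly as the
// extentSizes loop does with xnext/xprev. The same wiring in isolation, over a
// std::map with hypothetical node types:
#include <map>

namespace link_sketch {

struct Node {
    int next = -1;  // id of the next node, -1 for none (stand-in for xnext)
    int prev = -1;  // id of the previous node, -1 for none (stand-in for xprev)
};

inline void linkInOrder(std::map<int, Node>& nodes) {
    if (nodes.empty())
        return;
    auto it = nodes.begin();
    auto prev = it++;
    for (; it != nodes.end(); prev = it++) {
        prev->second.next = it->first;  // a.xnext = b
        it->second.prev = prev->first;  // b.xprev = a
    }
}

}  // namespace link_sketch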
- if (md->isCapped()) - md->setDeletedListEntry(opCtx, 1, DiskLoc()); - } - - if (records && !records[0].loc.isNull()) { - int recIdx = 0; - DiskLoc extLoc = md->firstExtent(opCtx); - while (!extLoc.isNull()) { - Extent* ext = em->getExtent(extLoc); - int prevOfs = DiskLoc::NullOfs; - while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent - const DiskLoc loc = records[recIdx].loc; - const int size = records[recIdx].size; - ; - invariant(size >= MmapV1RecordHeader::HeaderSize); - - md->incrementStats(opCtx, size - MmapV1RecordHeader::HeaderSize, 1); - - if (ext->firstRecord.isNull()) - ext->firstRecord = loc; - - MmapV1RecordHeader* rec = em->recordForV1(loc); - rec->lengthWithHeaders() = size; - rec->extentOfs() = 0; - - rec->prevOfs() = prevOfs; - prevOfs = loc.getOfs(); - - const DiskLoc nextLoc = records[recIdx + 1].loc; - if (nextLoc.a() == loc.a()) { // if next is in same extent - rec->nextOfs() = nextLoc.getOfs(); - } else { - rec->nextOfs() = DiskLoc::NullOfs; - ext->lastRecord = loc; - } - - recIdx++; - } - extLoc = ext->xnext; - } - invariant(records[recIdx].loc.isNull()); - } - - if (drecs && !drecs[0].loc.isNull()) { - int drecIdx = 0; - DiskLoc* prevNextPtr = NULL; - int lastBucket = -1; - while (!drecs[drecIdx].loc.isNull()) { - const DiskLoc loc = drecs[drecIdx].loc; - const int size = drecs[drecIdx].size; - invariant(size >= MmapV1RecordHeader::HeaderSize); - const int bucket = RecordStoreV1Base::bucket(size); - - if (md->isCapped()) { - // All drecs form a single list in bucket 0 - if (prevNextPtr == NULL) { - md->setDeletedListEntry(opCtx, 0, loc); - } else { - *prevNextPtr = loc; - } - - if (loc.a() < md->capExtent().a() && - drecs[drecIdx + 1].loc.a() == md->capExtent().a()) { - // Bucket 1 is known as cappedLastDelRecLastExtent - md->setDeletedListEntry(opCtx, 1, loc); - } - } else if (bucket != lastBucket) { - invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket - md->setDeletedListEntry(opCtx, bucket, loc); - lastBucket = bucket; - } else { - *prevNextPtr = loc; - } - - DeletedRecord* drec = &em->recordForV1(loc)->asDeleted(); - drec->lengthWithHeaders() = size; - drec->extentOfs() = 0; - drec->nextDeleted() = DiskLoc(); - prevNextPtr = &drec->nextDeleted(); - - drecIdx++; - } - } - - if (legacyGrabBag && !legacyGrabBag[0].loc.isNull()) { - invariant(!md->isCapped()); // capped should have an empty legacy grab bag. - - int grabBagIdx = 0; - DiskLoc* prevNextPtr = NULL; - while (!legacyGrabBag[grabBagIdx].loc.isNull()) { - const DiskLoc loc = legacyGrabBag[grabBagIdx].loc; - const int size = legacyGrabBag[grabBagIdx].size; - invariant(size >= MmapV1RecordHeader::HeaderSize); - - if (grabBagIdx == 0) { - md->setDeletedListLegacyGrabBag(opCtx, loc); - } else { - *prevNextPtr = loc; - } - - DeletedRecord* drec = &em->recordForV1(loc)->asDeleted(); - drec->lengthWithHeaders() = size; - drec->extentOfs() = 0; - drec->nextDeleted() = DiskLoc(); - prevNextPtr = &drec->nextDeleted(); - - grabBagIdx++; - } - } - - // Make sure we set everything up as requested. - assertStateV1RS(opCtx, records, drecs, legacyGrabBag, em, md); -} - -void assertStateV1RS(OperationContext* opCtx, - const LocAndSize* records, - const LocAndSize* drecs, - const LocAndSize* legacyGrabBag, - const ExtentManager* em, - const DummyRecordStoreV1MetaData* md) { - invariant(records || drecs); // if both are NULL nothing is being asserted... 
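// The LocAndSize arrays that initializeV1RS() and assertStateV1RS() consume
// are terminated by a default (null) entry instead of carrying an explicit
// length, so every walker loops "while (!las->loc.isNull()) ... las++". The
// idiom in isolation, with hypothetical toy types:
#include <cstddef>

namespace sentinel_sketch {

struct Loc {
    int a = -1;  // -1 plays the role of DiskLoc::isNull()
    int ofs = 0;
    bool isNull() const {
        return a == -1;
    }
};

struct LocAndSize {
    Loc loc;
    int size = 0;
};

inline std::size_t countEntries(const LocAndSize* las) {
    std::size_t n = 0;
    while (las && !las->loc.isNull()) {
        ++n;
        ++las;
    }
    return n;
}

}  // namespace sentinel_sketch

// Usage mirrors the tests above: LocAndSize drecs[] = {{{0, 1000}, 512}, {}};
// gives countEntries(drecs) == 1, with the trailing {} acting as the terminator.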
- - try { - if (records) { - long long dataSize = 0; - long long numRecs = 0; - - int recIdx = 0; - - DiskLoc extLoc = md->firstExtent(opCtx); - while (!extLoc.isNull()) { // for each Extent - Extent* ext = em->getExtent(extLoc, true); - int expectedPrevOfs = DiskLoc::NullOfs; - DiskLoc actualLoc = ext->firstRecord; - while (!actualLoc.isNull()) { // for each MmapV1RecordHeader in this Extent - const MmapV1RecordHeader* actualRec = em->recordForV1(actualLoc); - const int actualSize = actualRec->lengthWithHeaders(); - - dataSize += actualSize - MmapV1RecordHeader::HeaderSize; - numRecs += 1; - - ASSERT_EQUALS(actualLoc, records[recIdx].loc); - ASSERT_EQUALS(actualSize, records[recIdx].size); - - ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs()); - ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs); - expectedPrevOfs = actualLoc.getOfs(); - - recIdx++; - const int nextOfs = actualRec->nextOfs(); - actualLoc = - (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(actualLoc.a(), nextOfs)); - } - - if (ext->xnext.isNull()) { - ASSERT_EQUALS(md->lastExtent(opCtx), extLoc); - } - - extLoc = ext->xnext; - } - - // both the expected and actual record lists must be done at this point - ASSERT_EQUALS(records[recIdx].loc, DiskLoc()); - - ASSERT_EQUALS(dataSize, md->dataSize()); - ASSERT_EQUALS(numRecs, md->numRecords()); - } - - if (drecs) { - int drecIdx = 0; - for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) { - DiskLoc actualLoc = md->deletedListEntry(bucketIdx); - - if (md->isCapped() && bucketIdx == 1) { - // In capped collections, the 2nd bucket (index 1) points to the drec before - // the first drec in the capExtent. If the capExtent is the first Extent, - // it should be Null. - - if (md->capExtent() == md->firstExtent(opCtx)) { - ASSERT_EQUALS(actualLoc, DiskLoc()); - } else { - ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a()); - const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); - ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a()); - } - - // Don't do normal checking of bucket 1 in capped collections. Checking - // other buckets to verify that they are Null. - continue; - } - - while (!actualLoc.isNull()) { - const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); - const int actualSize = actualDrec->lengthWithHeaders(); - - ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc); - ASSERT_EQUALS(actualSize, drecs[drecIdx].size); - - // Make sure the drec is correct - ASSERT_EQUALS(actualDrec->extentOfs(), 0); - - // in capped collections all drecs are linked into a single list in bucket 0 - ASSERT_EQUALS(bucketIdx, - md->isCapped() ? 
0 : RecordStoreV1Base::bucket(actualSize)); - - drecIdx++; - actualLoc = actualDrec->nextDeleted(); - } - } - // both the expected and actual deleted lists must be done at this point - ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc()); - } - - if (legacyGrabBag) { - int grabBagIdx = 0; - DiskLoc actualLoc = md->deletedListLegacyGrabBag(); - while (!actualLoc.isNull()) { - const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted(); - const int actualSize = actualDrec->lengthWithHeaders(); - - ASSERT_EQUALS(actualLoc, legacyGrabBag[grabBagIdx].loc); - ASSERT_EQUALS(actualSize, legacyGrabBag[grabBagIdx].size); - - grabBagIdx++; - actualLoc = actualDrec->nextDeleted(); - } - - // both the expected and actual deleted lists must be done at this point - ASSERT_EQUALS(legacyGrabBag[grabBagIdx].loc, DiskLoc()); - } else { - // Unless a test is actually using the grabBag it should be empty - ASSERT_EQUALS(md->deletedListLegacyGrabBag(), DiskLoc()); - } - } catch (...) { - // If a test fails, provide extra info to make debugging easier - printRecList(opCtx, em, md); - printDRecList(em, md); - throw; - } -} -} diff --git a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h b/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h deleted file mode 100644 index c9af1e5cc36..00000000000 --- a/src/mongo/db/storage/mmap_v1/record_store_v1_test_help.h +++ /dev/null @@ -1,211 +0,0 @@ -// record_store_v1_test_help.h - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. 
-*/ - -#pragma once - -#include <vector> - -#include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_base.h" - -namespace mongo { - -class DummyRecordStoreV1MetaData : public RecordStoreV1MetaData { -public: - DummyRecordStoreV1MetaData(bool capped, int userFlags); - virtual ~DummyRecordStoreV1MetaData() {} - - virtual const DiskLoc& capExtent() const; - virtual void setCapExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual const DiskLoc& capFirstNewRecord() const; - virtual void setCapFirstNewRecord(OperationContext* opCtx, const DiskLoc& loc); - - virtual long long dataSize() const; - virtual long long numRecords() const; - - virtual void incrementStats(OperationContext* opCtx, - long long dataSizeIncrement, - long long numRecordsIncrement); - - virtual void setStats(OperationContext* opCtx, long long dataSize, long long numRecords); - - virtual DiskLoc deletedListEntry(int bucket) const; - virtual void setDeletedListEntry(OperationContext* opCtx, int bucket, const DiskLoc& loc); - - virtual DiskLoc deletedListLegacyGrabBag() const; - virtual void setDeletedListLegacyGrabBag(OperationContext* opCtx, const DiskLoc& loc); - - virtual void orphanDeletedList(OperationContext* opCtx); - - virtual const DiskLoc& firstExtent(OperationContext* opCtx) const; - virtual void setFirstExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual const DiskLoc& lastExtent(OperationContext* opCtx) const; - virtual void setLastExtent(OperationContext* opCtx, const DiskLoc& loc); - - virtual bool isCapped() const; - - virtual bool isUserFlagSet(int flag) const; - virtual int userFlags() const { - return _userFlags; - } - virtual bool setUserFlag(OperationContext* opCtx, int flag); - virtual bool clearUserFlag(OperationContext* opCtx, int flag); - virtual bool replaceUserFlags(OperationContext* opCtx, int flags); - - - virtual int lastExtentSize(OperationContext* opCtx) const; - virtual void setLastExtentSize(OperationContext* opCtx, int newMax); - - virtual long long maxCappedDocs() const; - -protected: - DiskLoc _capExtent; - DiskLoc _capFirstNewRecord; - - long long _dataSize; - long long _numRecords; - - DiskLoc _firstExtent; - DiskLoc _lastExtent; - - bool _capped; - int _userFlags; - long long _maxCappedDocs; - - int _lastExtentSize; - double _paddingFactor; - - std::vector<DiskLoc> _deletedLists; - DiskLoc _deletedListLegacyGrabBag; -}; - -class DummyExtentManager : public ExtentManager { -public: - virtual ~DummyExtentManager(); - - virtual void close(OperationContext* opCtx); - - virtual Status init(OperationContext* opCtx); - - virtual int numFiles() const; - virtual long long fileSize() const; - - virtual DiskLoc allocateExtent(OperationContext* opCtx, - bool capped, - int size, - bool enforceQuota); - - virtual void freeExtents(OperationContext* opCtx, DiskLoc firstExt, DiskLoc lastExt); - - virtual void freeExtent(OperationContext* opCtx, DiskLoc extent); - - virtual void freeListStats(OperationContext* opCtx, - int* numExtents, - int64_t* totalFreeSizeBytes) const; - - virtual MmapV1RecordHeader* recordForV1(const DiskLoc& loc) const; - - virtual std::unique_ptr<RecordFetcher> recordNeedsFetch(const DiskLoc& loc) const final; - - virtual Extent* extentForV1(const DiskLoc& loc) const; - - virtual DiskLoc extentLocForV1(const DiskLoc& loc) const; - - virtual Extent* getExtent(const DiskLoc& loc, bool doSanityCheck = true) const; - - virtual int maxSize() const; - - virtual 
CacheHint* cacheHint(const DiskLoc& extentLoc, const HintType& hint); - - DataFileVersion getFileFormat(OperationContext* opCtx) const final; - - virtual void setFileFormat(OperationContext* opCtx, DataFileVersion newVersion) final; - - const DataFile* getOpenFile(int n) const final; - - -protected: - struct ExtentInfo { - char* data; - size_t length; - }; - - std::vector<ExtentInfo> _extents; -}; - -struct LocAndSize { - DiskLoc loc; - int size; // with headers -}; - -/** - * Creates a V1 storage/mmap_v1 with the passed in records and DeletedRecords (drecs). - * - * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer is shorthand for - * an empty list. Each extent gets it's own DiskLoc file number. DiskLoc Offsets must be > 1000. - * - * records must be sorted by extent/file. offsets within an extent can be in any order. - * - * In a simple RS, drecs must be grouped into size-buckets, but the ordering within the size - * buckets is up to you. - * - * In a capped collection, all drecs form a single list and must be grouped by extent, with each - * extent having at least one drec. capFirstNewRecord() and capExtent() *must* be correctly set - * on md before calling. - * - * You are responsible for ensuring the records and drecs don't overlap. - * - * ExtentManager and MetaData must both be empty. - */ -void initializeV1RS(OperationContext* opCtx, - const LocAndSize* records, - const LocAndSize* drecs, - const LocAndSize* legacyGrabBag, - DummyExtentManager* em, - DummyRecordStoreV1MetaData* md); - -/** - * Asserts that the V1RecordStore defined by md has the passed in records and drecs in the - * correct order. - * - * List of LocAndSize are terminated by a Null DiskLoc. Passing a NULL pointer means don't check - * that list. - */ -void assertStateV1RS(OperationContext* opCtx, - const LocAndSize* records, - const LocAndSize* drecs, - const LocAndSize* legacyGrabBag, - const ExtentManager* em, - const DummyRecordStoreV1MetaData* md); - -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/repair_database.cpp b/src/mongo/db/storage/mmap_v1/repair_database.cpp deleted file mode 100644 index 416ff14063e..00000000000 --- a/src/mongo/db/storage/mmap_v1/repair_database.cpp +++ /dev/null @@ -1,499 +0,0 @@ -// repair_database.cpp - -/** -* Copyright (C) 2014 MongoDB Inc. -* -* This program is free software: you can redistribute it and/or modify -* it under the terms of the GNU Affero General Public License, version 3, -* as published by the Free Software Foundation. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU Affero General Public License for more details. -* -* You should have received a copy of the GNU Affero General Public License -* along with this program. If not, see <http://www.gnu.org/licenses/>. -* -* As a special exception, the copyright holders give permission to link the -* code of portions of this program with the OpenSSL library under certain -* conditions as described in each individual source file and distribute -* linked combinations including the program with the OpenSSL library. You -* must comply with the GNU Affero General Public License in all respects for -* all of the code used other than as permitted herein. If you modify file(s) -* with this exception, you may extend this exception to your version of the -* file(s), but you are not obligated to do so. 
If you do not wish to do so, -* delete this exception statement from your version. If you delete this -* exception statement from all source files in the program, then also delete -* it in the license file. -*/ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kStorage - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/mmap_v1_engine.h" - -#include <boost/filesystem/operations.hpp> - -#include "mongo/db/background.h" -#include "mongo/db/catalog/collection.h" -#include "mongo/db/catalog/database.h" -#include "mongo/db/catalog/database_holder.h" -#include "mongo/db/catalog/index_create.h" -#include "mongo/db/catalog/uuid_catalog.h" -#include "mongo/db/client.h" -#include "mongo/db/db_raii.h" -#include "mongo/db/index/index_descriptor.h" -#include "mongo/db/storage/mmap_v1/dur.h" -#include "mongo/db/storage/mmap_v1/file_allocator.h" -#include "mongo/db/storage/mmap_v1/mmap.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_database_catalog_entry.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/mmap_v1/repair_database_interface.h" -#include "mongo/util/file.h" -#include "mongo/util/log.h" -#include "mongo/util/scopeguard.h" - -namespace mongo { - -using std::unique_ptr; -using std::endl; -using std::map; -using std::string; -using std::stringstream; -using std::vector; - -typedef boost::filesystem::path Path; - -// inheritable class to implement an operation that may be applied to all -// files in a database using _applyOpToDataFiles() -class FileOp { -public: - virtual ~FileOp() {} - // Return true if file exists and operation successful - virtual bool apply(const boost::filesystem::path& p) = 0; - virtual const char* op() const = 0; -}; - -void _applyOpToDataFiles(const string& database, - FileOp& fo, - bool afterAllocator = false, - const string& path = storageGlobalParams.dbpath); - -void _deleteDataFiles(const std::string& database) { - if (storageGlobalParams.directoryperdb) { - FileAllocator::get()->waitUntilFinished(); - MONGO_ASSERT_ON_EXCEPTION_WITH_MSG( - boost::filesystem::remove_all(boost::filesystem::path(storageGlobalParams.dbpath) / - database), - "delete data files with a directoryperdb"); - return; - } - class : public FileOp { - virtual bool apply(const boost::filesystem::path& p) { - return boost::filesystem::remove(p); - } - virtual const char* op() const { - return "remove"; - } - } deleter; - _applyOpToDataFiles(database, deleter, true); -} - -void boostRenameWrapper(const Path& from, const Path& to) { - try { - boost::filesystem::rename(from, to); - } catch (const boost::filesystem::filesystem_error&) { - // boost rename doesn't work across partitions - boost::filesystem::copy_file(from, to); - boost::filesystem::remove(from); - } -} - -// back up original database files to 'temp' dir -void _renameForBackup(const std::string& database, const Path& reservedPath) { - Path newPath(reservedPath); - if (storageGlobalParams.directoryperdb) - newPath /= database; - class Renamer : public FileOp { - public: - Renamer(const Path& newPath) : newPath_(newPath) {} - - private: - const boost::filesystem::path& newPath_; - virtual bool apply(const Path& p) { - if (!boost::filesystem::exists(p)) - return false; - boostRenameWrapper(p, newPath_ / (p.leaf().string() + ".bak")); - return true; - } - virtual const char* op() const { - return "renaming"; - } - } renamer(newPath); - _applyOpToDataFiles(database, renamer, true); -} - -intmax_t dbSize(const string& database) { - class SizeAccumulator : public FileOp 
{ - public: - SizeAccumulator() : totalSize_(0) {} - intmax_t size() const { - return totalSize_; - } - - private: - virtual bool apply(const boost::filesystem::path& p) { - if (!boost::filesystem::exists(p)) - return false; - totalSize_ += boost::filesystem::file_size(p); - return true; - } - virtual const char* op() const { - return "checking size"; - } - intmax_t totalSize_; - }; - SizeAccumulator sa; - _applyOpToDataFiles(database, sa); - return sa.size(); -} - -// move temp files to standard data dir -void _replaceWithRecovered(const string& database, const char* reservedPathString) { - Path newPath(storageGlobalParams.dbpath); - if (storageGlobalParams.directoryperdb) - newPath /= database; - class Replacer : public FileOp { - public: - Replacer(const Path& newPath) : newPath_(newPath) {} - - private: - const boost::filesystem::path& newPath_; - virtual bool apply(const Path& p) { - if (!boost::filesystem::exists(p)) - return false; - boostRenameWrapper(p, newPath_ / p.leaf()); - return true; - } - virtual const char* op() const { - return "renaming"; - } - } replacer(newPath); - _applyOpToDataFiles(database, replacer, true, reservedPathString); -} - -// generate a directory name for storing temp data files -Path uniqueReservedPath(const char* prefix) { - Path repairPath = Path(storageGlobalParams.repairpath); - Path reservedPath; - int i = 0; - bool exists = false; - do { - stringstream ss; - ss << prefix << "_repairDatabase_" << i++; - reservedPath = repairPath / ss.str(); - MONGO_ASSERT_ON_EXCEPTION(exists = boost::filesystem::exists(reservedPath)); - } while (exists); - return reservedPath; -} - -void _applyOpToDataFiles(const string& database, - FileOp& fo, - bool afterAllocator, - const string& path) { - if (afterAllocator) - FileAllocator::get()->waitUntilFinished(); - string c = database; - c += '.'; - boost::filesystem::path p(path); - if (storageGlobalParams.directoryperdb) - p /= database; - boost::filesystem::path q; - q = p / (c + "ns"); - bool ok = false; - MONGO_ASSERT_ON_EXCEPTION(ok = fo.apply(q)); - if (ok) { - LOG(2) << fo.op() << " file " << q.string() << endl; - } - int i = 0; - int extra = 10; // should not be necessary, this is defensive in case there are missing files - while (1) { - verify(i <= DiskLoc::MaxFiles); - stringstream ss; - ss << c << i; - q = p / ss.str(); - MONGO_ASSERT_ON_EXCEPTION(ok = fo.apply(q)); - if (ok) { - if (extra != 10) { - LOG(1) << fo.op() << " file " << q.string() << endl; - log() << " _applyOpToDataFiles() warning: extra == " << extra << endl; - } - } else if (--extra <= 0) - break; - i++; - } -} - -class RepairFileDeleter { -public: - RepairFileDeleter(OperationContext* opCtx, - const string& dbName, - const string& pathString, - const Path& path) - : _opCtx(opCtx), _dbName(dbName), _pathString(pathString), _path(path), _success(false) {} - - ~RepairFileDeleter() { - if (_success) - return; - - log() << "cleaning up failed repair " - << "db: " << _dbName << " path: " << _pathString; - - try { - getDur().syncDataAndTruncateJournal(_opCtx); - - // need both in case journaling is disabled - MongoFile::flushAll(_opCtx, true); - - MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::remove_all(_path)); - } catch (DBException& e) { - error() << "RepairFileDeleter failed to cleanup: " << redact(e); - error() << "aborting"; - fassertFailed(17402); - } - } - - void success() { - _success = true; - } - -private: - OperationContext* _opCtx; - string _dbName; - string _pathString; - Path _path; - bool _success; -}; - -Status 
MMAPV1Engine::repairDatabase(OperationContext* opCtx, - const std::string& dbName, - bool preserveClonedFilesOnFailure, - bool backupOriginalFiles) { - unique_ptr<RepairFileDeleter> repairFileDeleter; - - // Must be done before and after repair - getDur().syncDataAndTruncateJournal(opCtx); - - intmax_t totalSize = dbSize(dbName); - intmax_t freeSize = File::freeSpace(storageGlobalParams.repairpath); - - if (freeSize > -1 && freeSize < totalSize) { - return Status(ErrorCodes::OutOfDiskSpace, - str::stream() << "Cannot repair database " << dbName << " having size: " - << totalSize - << " (bytes) because free disk space is: " - << freeSize - << " (bytes)"); - } - - opCtx->checkForInterrupt(); - - Path reservedPath = uniqueReservedPath( - (preserveClonedFilesOnFailure || backupOriginalFiles) ? "backup" : "_tmp"); - bool created = false; - MONGO_ASSERT_ON_EXCEPTION(created = boost::filesystem::create_directory(reservedPath)); - invariant(created); - string reservedPathString = reservedPath.string(); - - if (!preserveClonedFilesOnFailure) - repairFileDeleter.reset( - new RepairFileDeleter(opCtx, dbName, reservedPathString, reservedPath)); - - { - Database* originalDatabase = DatabaseHolder::getDatabaseHolder().openDb(opCtx, dbName); - if (originalDatabase == NULL) { - return Status(ErrorCodes::NamespaceNotFound, "database does not exist to repair"); - } - - unique_ptr<MMAPV1DatabaseCatalogEntry> dbEntry; - unique_ptr<Database> tempDatabase; - - // Must call this before MMAPV1DatabaseCatalogEntry's destructor closes the DB files - ON_BLOCK_EXIT([&dbEntry, &opCtx, &tempDatabase] { - getDur().syncDataAndTruncateJournal(opCtx); - UUIDCatalog::get(opCtx).onCloseDatabase(tempDatabase.get()); - dbEntry->close(opCtx); - }); - - { - dbEntry.reset(new MMAPV1DatabaseCatalogEntry( - opCtx, - dbName, - reservedPathString, - storageGlobalParams.directoryperdb, - true, - _extentManagerFactory->create( - dbName, reservedPathString, storageGlobalParams.directoryperdb))); - tempDatabase.reset(new Database(opCtx, dbName, dbEntry.get())); - } - - map<string, CollectionOptions> namespacesToCopy; - { - NamespaceString nss(dbName, "system.namespaces"); - OldClientContext ctx(opCtx, nss.ns()); - Collection* coll = originalDatabase->getCollection(opCtx, nss); - if (coll) { - auto cursor = coll->getCursor(opCtx); - while (auto record = cursor->next()) { - BSONObj obj = record->data.releaseToBson(); - - string ns = obj["name"].String(); - - NamespaceString nss(ns); - if (nss.isSystem()) { - if (nss.isSystemDotIndexes()) - continue; - if (nss.coll() == "system.namespaces") - continue; - } - - if (!nss.isNormal()) - continue; - - CollectionOptions options; - if (obj["options"].isABSONObj()) { - Status status = - options.parse(obj["options"].Obj(), CollectionOptions::parseForStorage); - if (!status.isOK()) - return status; - } - namespacesToCopy[ns] = options; - } - } - } - - for (map<string, CollectionOptions>::const_iterator i = namespacesToCopy.begin(); - i != namespacesToCopy.end(); - ++i) { - string ns = i->first; - NamespaceString nss(ns); - CollectionOptions options = i->second; - - Collection* tempCollection = NULL; - { - WriteUnitOfWork wunit(opCtx); - if (options.uuid) { - UUIDCatalog::get(opCtx).onDropCollection(opCtx, options.uuid.get()); - } - tempCollection = tempDatabase->createCollection(opCtx, ns, options, false); - wunit.commit(); - } - - OldClientContext readContext(opCtx, ns, originalDatabase); - Collection* originalCollection = originalDatabase->getCollection(opCtx, nss); - invariant(originalCollection); 
- - // data - - // TODO SERVER-14812 add a mode that drops duplicates rather than failing - MultiIndexBlock indexer(opCtx, tempCollection); - { - vector<BSONObj> indexes; - IndexCatalog::IndexIterator ii = - originalCollection->getIndexCatalog()->getIndexIterator(opCtx, false); - while (ii.more()) { - IndexDescriptor* desc = ii.next(); - indexes.push_back(desc->infoObj()); - } - - Status status = indexer.init(indexes).getStatus(); - if (!status.isOK()) { - return status; - } - } - - std::vector<MultiIndexBlock*> indexers{&indexer}; - auto cursor = originalCollection->getCursor(opCtx); - while (auto record = cursor->next()) { - BSONObj doc = record->data.releaseToBson(); - - WriteUnitOfWork wunit(opCtx); - Status status = tempCollection->insertDocument(opCtx, doc, indexers, false); - if (!status.isOK()) - return status; - - wunit.commit(); - opCtx->checkForInterrupt(); - } - - Status status = indexer.doneInserting(); - if (!status.isOK()) - return status; - - { - WriteUnitOfWork wunit(opCtx); - indexer.commit(); - wunit.commit(); - } - } - - getDur().syncDataAndTruncateJournal(opCtx); - - // need both in case journaling is disabled - MongoFile::flushAll(opCtx, true); - - opCtx->checkForInterrupt(); - } - - // at this point if we abort, we don't want to delete new files - // as they might be the only copies - - if (repairFileDeleter.get()) - repairFileDeleter->success(); - - // Close the database so we can rename/delete the original data files - DatabaseHolder::getDatabaseHolder().close(opCtx, dbName, "database closed for repair"); - - if (backupOriginalFiles) { - _renameForBackup(dbName, reservedPath); - } else { - // first make new directory before deleting data - Path newDir = Path(storageGlobalParams.dbpath) / dbName; - MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir)); - - // this deletes old files - _deleteDataFiles(dbName); - - if (!boost::filesystem::exists(newDir)) { - // we deleted because of directoryperdb - // re-create - MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::create_directory(newDir)); - } - } - - _replaceWithRecovered(dbName, reservedPathString.c_str()); - - if (!backupOriginalFiles) { - MONGO_ASSERT_ON_EXCEPTION(boost::filesystem::remove_all(reservedPath)); - } - - // Reopen the database so it's discoverable - DatabaseHolder::getDatabaseHolder().openDb(opCtx, dbName); - - return Status::OK(); -} - -MONGO_INITIALIZER(RepairDatabaseMMapV1)(InitializerContext* context) { - setRepairDatabaseMmapv1Impl([](StorageEngine* engine, - OperationContext* opCtx, - const std::string& dbName, - bool preserveClonedFilesOnFailure, - bool backupOriginalFiles) { - return static_cast<MMAPV1Engine*>(engine)->repairDatabase( - opCtx, dbName, preserveClonedFilesOnFailure, backupOriginalFiles); - }); - return Status::OK(); -} -} diff --git a/src/mongo/db/storage/mmap_v1/repair_database_interface.cpp b/src/mongo/db/storage/mmap_v1/repair_database_interface.cpp deleted file mode 100644 index 6988cf76b66..00000000000 --- a/src/mongo/db/storage/mmap_v1/repair_database_interface.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright (C) 2018 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. - */ - -#include "mongo/db/storage/mmap_v1/repair_database_interface.h" - -namespace mongo { -namespace { -stdx::function<Status(StorageEngine*, OperationContext*, const std::string&, bool, bool)> - repairDatabaseMmapv1Func; -} // namespace - -void setRepairDatabaseMmapv1Impl( - stdx::function<Status(StorageEngine* engine, OperationContext*, const std::string&, bool, bool)> - impl) { - repairDatabaseMmapv1Func = std::move(impl); -} - -Status repairDatabaseMmapv1(StorageEngine* engine, - OperationContext* opCtx, - const std::string& dbName, - bool preserveClonedFilesOnFailure, - bool backupOriginalFiles) { - return repairDatabaseMmapv1Func( - engine, opCtx, dbName, preserveClonedFilesOnFailure, backupOriginalFiles); -} -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/repair_database_interface.h b/src/mongo/db/storage/mmap_v1/repair_database_interface.h deleted file mode 100644 index c9fab68cc81..00000000000 --- a/src/mongo/db/storage/mmap_v1/repair_database_interface.h +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright (C) 2018 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects for - * all of the code used other than as permitted herein. If you modify file(s) - * with this exception, you may extend this exception to your version of the - * file(s), but you are not obligated to do so. If you do not wish to do so, - * delete this exception statement from your version. If you delete this - * exception statement from all source files in the program, then also delete - * it in the license file. 
- */ - -#pragma once - -#include "mongo/base/status.h" -#include "mongo/stdx/functional.h" - -namespace mongo { -class StorageEngine; -class OperationContext; - -void setRepairDatabaseMmapv1Impl( - stdx::function<Status(StorageEngine*, OperationContext*, std::string const&, bool, bool)> impl); - -Status repairDatabaseMmapv1(StorageEngine* engine, - OperationContext* opCtx, - const std::string& dbName, - bool preserveClonedFilesOnFailure, - bool backupOriginalFiles); -} // namespace mongo diff --git a/src/mongo/db/storage/mmap_v1/touch_pages.cpp b/src/mongo/db/storage/mmap_v1/touch_pages.cpp deleted file mode 100644 index 7aedffe2fe3..00000000000 --- a/src/mongo/db/storage/mmap_v1/touch_pages.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include "mongo/db/storage/mmap_v1/touch_pages.h" - -namespace mongo { - -char _touch_pages_char_reader; // goes in .bss - -void touch_pages(const char* buf, size_t length, size_t pageSize) { - // read first byte of every page, in order - for (size_t i = 0; i < length; i += pageSize) { - _touch_pages_char_reader += buf[i]; - } -} -} diff --git a/src/mongo/db/storage/mmap_v1/touch_pages.h b/src/mongo/db/storage/mmap_v1/touch_pages.h deleted file mode 100644 index c98b0e9a427..00000000000 --- a/src/mongo/db/storage/mmap_v1/touch_pages.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright 2009 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#pragma once - -#include <cstdlib> - -namespace mongo { - -/** - * Touches a range of pages, to encourage the OS to get them into the buffer pool. - */ -void touch_pages(const char* buf, size_t length, size_t pageSize = 4096); -} diff --git a/src/mongo/db/storage/mobile/mobile_recovery_unit.h b/src/mongo/db/storage/mobile/mobile_recovery_unit.h index db008586f50..6f13edab943 100644 --- a/src/mongo/db/storage/mobile/mobile_recovery_unit.h +++ b/src/mongo/db/storage/mobile/mobile_recovery_unit.h @@ -62,12 +62,6 @@ public: void registerChange(Change* change) override; - void* writingPtr(void* data, size_t len) override { - MONGO_UNREACHABLE; - } - - void setRollbackWritesDisabled() override {} - SnapshotId getSnapshotId() const override { return SnapshotId(); } diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index c4acb3935bb..7a12dfe136e 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -355,50 +355,6 @@ public: registerChange(new OnCommitChange(std::move(callback))); } - // - // The remaining methods probably belong on DurRecoveryUnit rather than on the interface. - // - - /** - * Declare that the data at [x, x + len) is being written. - */ - virtual void* writingPtr(void* data, size_t len) = 0; - - // - // Syntactic sugar - // - - /** - * Declare write intent for an int - */ - inline int& writingInt(int& d) { - return *writing(&d); - } - - /** - * A templated helper for writingPtr. - */ - template <typename T> - inline T* writing(T* x) { - writingPtr(x, sizeof(T)); - return x; - } - - /** - * Sets a flag that declares this RecoveryUnit will skip rolling back writes, for the - * duration of the current outermost WriteUnitOfWork. This function can only be called - * between a pair of unnested beginUnitOfWork() / endUnitOfWork() calls. - * The flag is cleared when endUnitOfWork() is called. - * While the flag is set, rollback will skip rolling back writes, but custom rollback - * change functions are still called. Clearly, this functionality should only be used when - * writing to temporary collections that can be cleaned up externally. For example, - * foreground index builds write to a temporary collection; if something goes wrong that - * normally requires a rollback, we can instead clean up the index by dropping the entire - * index. - * Setting the flag may permit increased performance. 
- */ - virtual void setRollbackWritesDisabled() = 0; - virtual void setOrderedCommit(bool orderedCommit) = 0; protected: diff --git a/src/mongo/db/storage/recovery_unit_noop.h b/src/mongo/db/storage/recovery_unit_noop.h index 9713b6aa1ec..16916414b7e 100644 --- a/src/mongo/db/storage/recovery_unit_noop.h +++ b/src/mongo/db/storage/recovery_unit_noop.h @@ -71,11 +71,6 @@ public: _changes.push_back(std::unique_ptr<Change>(change)); } - virtual void* writingPtr(void* data, size_t len) { - return data; - } - virtual void setRollbackWritesDisabled() {} - virtual SnapshotId getSnapshotId() const { return SnapshotId(); } diff --git a/src/mongo/db/storage/storage_engine_init.cpp b/src/mongo/db/storage/storage_engine_init.cpp index 2c4a526e64b..c22e6915e52 100644 --- a/src/mongo/db/storage/storage_engine_init.cpp +++ b/src/mongo/db/storage/storage_engine_init.cpp @@ -126,12 +126,6 @@ void initializeStorageEngine(ServiceContext* service, const StorageEngineInitFla log() << startupWarningsLog; } - const std::string repairpath = storageGlobalParams.repairpath; - uassert(40311, - str::stream() << "Cannot start server. The command line option '--repairpath'" - << " is only supported by the mmapv1 storage engine", - repairpath.empty() || repairpath == dbpath || storageGlobalParams.engine == "mmapv1"); - const StorageEngine::Factory* factory = getFactoryForStorageEngine(service, storageGlobalParams.engine); diff --git a/src/mongo/db/storage/storage_engine_metadata.cpp b/src/mongo/db/storage/storage_engine_metadata.cpp index 1f1c061f4fa..8892baf1a1b 100644 --- a/src/mongo/db/storage/storage_engine_metadata.cpp +++ b/src/mongo/db/storage/storage_engine_metadata.cpp @@ -40,9 +40,15 @@ #include <ostream> #include <vector> +#ifdef __linux__ // Only needed by flushDirectory for Linux +#include <boost/filesystem/path.hpp> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> +#endif + #include "mongo/db/bson/dotted_path_support.h" #include "mongo/db/jsobj.h" -#include "mongo/db/storage/mmap_v1/paths.h" #include "mongo/util/assert_util.h" #include "mongo/util/file.h" #include "mongo/util/log.h" @@ -56,15 +62,6 @@ namespace { const std::string kMetadataBasename = "storage.bson"; -/** - * Returns true if local.ns is found in 'directory' or 'directory'/local/. - */ -bool containsMMapV1LocalNsFile(const std::string& directory) { - boost::filesystem::path directoryPath(directory); - return boost::filesystem::exists(directoryPath / "local.ns") || - boost::filesystem::exists((directoryPath / "local") / "local.ns"); -} - bool fsyncFile(boost::filesystem::path path) { invariant(path.has_filename()); File file; @@ -99,11 +96,6 @@ boost::optional<std::string> StorageEngineMetadata::getStorageEngineForPath( return {metadata->getStorageEngine()}; } - // Fallback to checking for MMAPv1-specific files to handle upgrades from before the - // storage.bson metadata file was introduced in 3.0. - if (containsMMapV1LocalNsFile(dbpath)) { - return {std::string("mmapv1")}; - } return {}; } @@ -221,6 +213,51 @@ Status StorageEngineMetadata::read() { return Status::OK(); } +void flushMyDirectory(const boost::filesystem::path& file) { +#ifdef __linux__ // this isn't needed elsewhere + static bool _warnedAboutFilesystem = false; + // if called without a fully qualified path it asserts; that makes mongoperf fail. + // so make a warning. need a better solution longer term. 
+ // massert(13652, str::stream() << "Couldn't find parent dir for file: " << file.string(),); + if (!file.has_branch_path()) { + log() << "warning flushMyDirectory couldn't find parent dir for file: " << file.string(); + return; + } + + + boost::filesystem::path dir = file.branch_path(); // parent_path in new boosts + + LOG(1) << "flushing directory " << dir.string(); + + int fd = ::open(dir.string().c_str(), O_RDONLY); // DO NOT THROW OR ASSERT BEFORE CLOSING + massert(13650, + str::stream() << "Couldn't open directory '" << dir.string() << "' for flushing: " + << errnoWithDescription(), + fd >= 0); + if (fsync(fd) != 0) { + int e = errno; + if (e == EINVAL) { // indicates filesystem does not support synchronization + if (!_warnedAboutFilesystem) { + log() << "\tWARNING: This file system is not supported. For further information" + << " see:" << startupWarningsLog; + log() << "\t\t\thttp://dochub.mongodb.org/core/unsupported-filesystems" + << startupWarningsLog; + log() << "\t\tPlease notify MongoDB, Inc. if an unlisted filesystem generated " + << "this warning." << startupWarningsLog; + _warnedAboutFilesystem = true; + } + } else { + close(fd); + massert(13651, + str::stream() << "Couldn't fsync directory '" << dir.string() << "': " + << errnoWithDescription(e), + false); + } + } + close(fd); +#endif +} + Status StorageEngineMetadata::write() const { if (_storageEngine.empty()) { return Status(ErrorCodes::BadValue, diff --git a/src/mongo/db/storage/storage_options.h b/src/mongo/db/storage/storage_options.h index 57c58ea747a..144dd1eb77b 100644 --- a/src/mongo/db/storage/storage_options.h +++ b/src/mongo/db/storage/storage_options.h @@ -74,12 +74,6 @@ struct StorageGlobalParams { // running the repairDatabase database command on all databases. bool repair; - // --repairpath - // Specifies the root directory containing MongoDB data files to use for the --repair - // operation. - // Default: A _tmp directory within the path specified by the dbPath option. 
- std::string repairpath; - bool dur; // --dur durability (now --journal) // --journalCommitInterval diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index b188b8de6a1..dd569219ab5 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -236,11 +236,6 @@ void WiredTigerRecoveryUnit::preallocateSnapshot() { getSession(); } -void* WiredTigerRecoveryUnit::writingPtr(void* data, size_t len) { - // This API should not be used for anything other than the MMAP V1 storage engine - MONGO_UNREACHABLE; -} - void WiredTigerRecoveryUnit::_txnClose(bool commit) { invariant(_active); WT_SESSION* s = _session->getSession(); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index 8bae90f0368..6eda5ef4588 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -102,10 +102,6 @@ public: ReadSource getTimestampReadSource() const override; - void* writingPtr(void* data, size_t len) override; - - void setRollbackWritesDisabled() override {} - virtual void setOrderedCommit(bool orderedCommit) override { _orderedCommit = orderedCommit; } diff --git a/src/mongo/dbtests/SConscript b/src/mongo/dbtests/SConscript index 790e17896c2..34d16d7dfea 100644 --- a/src/mongo/dbtests/SConscript +++ b/src/mongo/dbtests/SConscript @@ -11,7 +11,6 @@ env.Library( "framework_options.cpp", ], LIBDEPS=[ - '$BUILD_DIR/mongo/db/storage/mmap_v1/storage_mmapv1', '$BUILD_DIR/mongo/db/storage/storage_options', '$BUILD_DIR/mongo/util/options_parser/options_parser_init', '$BUILD_DIR/mongo/unittest/unittest', @@ -75,11 +74,9 @@ dbtest = env.Program( 'jstests.cpp', 'logical_sessions_tests.cpp', 'matchertests.cpp', - 'mmaptests.cpp', 'mock_dbclient_conn_test.cpp', 'mock_replica_set_test.cpp', 'multikey_paths_test.cpp', - 'namespacetests.cpp', 'oplogstarttests.cpp', 'pdfiletests.cpp', 'plan_ranking.cpp', @@ -138,7 +135,6 @@ dbtest = env.Program( "$BUILD_DIR/mongo/db/repl/storage_interface_impl", "$BUILD_DIR/mongo/db/serveronly", "$BUILD_DIR/mongo/db/sessions_collection_standalone", - "$BUILD_DIR/mongo/db/storage/mmap_v1/paths", "$BUILD_DIR/mongo/db/storage/kv/kv_engine_core", "$BUILD_DIR/mongo/transport/transport_layer_manager", "$BUILD_DIR/mongo/util/clock_source_mock", diff --git a/src/mongo/dbtests/basictests.cpp b/src/mongo/dbtests/basictests.cpp index 541d2c776ba..4e9832639e9 100644 --- a/src/mongo/dbtests/basictests.cpp +++ b/src/mongo/dbtests/basictests.cpp @@ -34,8 +34,6 @@ #include <iostream> #include "mongo/db/client.h" -#include "mongo/db/storage/mmap_v1/compress.h" -#include "mongo/db/storage/mmap_v1/paths.h" #include "mongo/dbtests/dbtests.h" #include "mongo/util/base64.h" #include "mongo/util/queue.h" @@ -382,36 +380,6 @@ public: } }; -class RelativePathTest { -public: - void run() { - RelativePath a = RelativePath::fromRelativePath("a"); - RelativePath b = RelativePath::fromRelativePath("a"); - RelativePath c = RelativePath::fromRelativePath("b"); - RelativePath d = RelativePath::fromRelativePath("a/b"); - - - ASSERT(a == b); - ASSERT(a != c); - ASSERT(a != d); - ASSERT(c != d); - } -}; - -struct CompressionTest1 { - void run() { - const char* c = "this is a test"; - std::string s; - size_t len = compress(c, strlen(c) + 1, &s); - verify(len > 0); - - std::string out; - bool ok = 
uncompress(s.c_str(), s.size(), &out); - verify(ok); - verify(strcmp(out.c_str(), c) == 0); - } -} ctest1; - class All : public Suite { public: All() : Suite("basic") {} @@ -436,9 +404,6 @@ public: add<StrTests>(); add<HostAndPortTests>(); - add<RelativePathTest>(); - - add<CompressionTest1>(); } }; diff --git a/src/mongo/dbtests/framework_options.cpp b/src/mongo/dbtests/framework_options.cpp index 192760718d4..d9929c5842c 100644 --- a/src/mongo/dbtests/framework_options.cpp +++ b/src/mongo/dbtests/framework_options.cpp @@ -39,7 +39,6 @@ #include "mongo/base/status.h" #include "mongo/bson/util/builder.h" #include "mongo/db/query/find.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" #include "mongo/db/storage/storage_options.h" #include "mongo/dbtests/dbtests.h" #include "mongo/unittest/unittest.h" @@ -152,18 +151,6 @@ Status storeTestFrameworkOptions(const moe::Environment& params, frameworkGlobalParams.perfHist = params["perfHist"].as<unsigned>(); } - bool nodur = false; - if (params.count("nodur")) { - nodur = true; - storageGlobalParams.dur = false; - } - if (params.count("dur") || storageGlobalParams.dur) { - storageGlobalParams.dur = true; - } - - if (params.count("nopreallocj")) { - mmapv1GlobalOptions.preallocj = false; - } if (params.count("debug") || params.count("verbose")) { logger::globalLogDomain()->setMinimumLoggedSeverity(logger::LogSeverity::Debug(1)); @@ -194,32 +181,11 @@ Status storeTestFrameworkOptions(const moe::Environment& params, return Status(ErrorCodes::BadValue, sb.str()); } + DEV log() << "DEBUG build" << endl; + string dbpathString = p.string(); storageGlobalParams.dbpath = dbpathString.c_str(); - mmapv1GlobalOptions.prealloc = false; - - // dbtest defaults to smallfiles - mmapv1GlobalOptions.smallfiles = true; - if (params.count("bigfiles")) { - storageGlobalParams.dur = true; - } - - DEV log() << "DEBUG build" << endl; - if (sizeof(void*) == 4) - log() << "32bit" << endl; - log() << "random seed: " << frameworkGlobalParams.seed << endl; - - if (time(0) % 3 == 0 && !nodur) { - if (!storageGlobalParams.dur) { - storageGlobalParams.dur = true; - log() << "****************" << endl; - log() << "running with journaling enabled to test that. dbtests will do this " - << "occasionally even if --dur is not specified." << endl; - log() << "****************" << endl; - } - } - storageGlobalParams.engine = params["storage.engine"].as<string>(); if (params.count("suites")) { @@ -231,13 +197,6 @@ Status storeTestFrameworkOptions(const moe::Environment& params, frameworkGlobalParams.filter = params["filter"].as<string>(); } - if (kDebugBuild && storageGlobalParams.dur) { - log() << "Debug Build: automatically enabling mmapv1GlobalOptions.journalOptions=8 " - << "(JournalParanoid)" << endl; - // this was commented out. why too slow or something? 
- mmapv1GlobalOptions.journalOptions |= MMAPV1Options::JournalParanoid; - } - return Status::OK(); } } diff --git a/src/mongo/dbtests/jsobjtests.cpp b/src/mongo/dbtests/jsobjtests.cpp index b670cab1dae..7dca7745a97 100644 --- a/src/mongo/dbtests/jsobjtests.cpp +++ b/src/mongo/dbtests/jsobjtests.cpp @@ -43,7 +43,6 @@ #include "mongo/db/bson/dotted_path_support.h" #include "mongo/db/jsobj.h" #include "mongo/db/json.h" -#include "mongo/db/storage/mmap_v1/btree/key.h" #include "mongo/dbtests/dbtests.h" #include "mongo/platform/decimal128.h" #include "mongo/util/allocator.h" @@ -168,63 +167,6 @@ FieldCompareResult compareDottedFieldNames(const string& l, const string& r, con namespace JsobjTests { -void keyTest(const BSONObj& o, bool mustBeCompact = false) { - static KeyV1Owned* kLast; - static BSONObj last; - - KeyV1Owned* key = new KeyV1Owned(o); - KeyV1Owned& k = *key; - - ASSERT(!mustBeCompact || k.isCompactFormat()); - - BSONObj x = k.toBson(); - int res = o.woCompare(x, BSONObj(), /*considerfieldname*/ false); - if (res) { - cout << o.toString() << endl; - k.toBson(); - cout << x.toString() << endl; - o.woCompare(x, BSONObj(), /*considerfieldname*/ false); - ASSERT(res == 0); - } - ASSERT(k.woEqual(k)); - ASSERT(!k.isCompactFormat() || k.dataSize() < o.objsize()); - - { - int res = o.woCompare(last); - ASSERT((res == 0) == SimpleBSONObjComparator::kInstance.evaluate(o == last)); - } - - if (kLast) { - int r1 = o.woCompare(last, BSONObj(), false); - int r2 = k.woCompare(*kLast, Ordering::make(BSONObj())); - bool ok = (r1 < 0 && r2 < 0) || (r1 > 0 && r2 > 0) || r1 == r2; - if (!ok) { - cout << "r1r2 " << r1 << ' ' << r2 << endl; - cout << "o:" << o.toString() << endl; - cout << "last:" << last.toString() << endl; - cout << "k:" << k.toString() << endl; - cout << "kLast:" << kLast->toString() << endl; - int r3 = k.woCompare(*kLast, Ordering::make(BSONObj())); - cout << r3 << endl; - } - ASSERT(ok); - if (k.isCompactFormat() && kLast->isCompactFormat()) { - // only check if not bson as bson woEqual is broken! 
(or was may2011) - if (k.woEqual(*kLast) != (r2 == 0)) { // check woEqual matches - cout << r2 << endl; - cout << k.toString() << endl; - cout << kLast->toString() << endl; - k.woEqual(*kLast); - ASSERT(false); - } - } - } - - delete kLast; - kLast = key; - last = o.getOwned(); -} - class BufBuilderBasic { public: void run() { @@ -481,31 +423,10 @@ public: key) < 0); { - // test a big key - string x(2000, 'z'); - BSONObj o = BSON("q" << x); - keyTest(o, false); - } - { - string y(200, 'w'); - BSONObjBuilder b; - for (int i = 0; i < 10; i++) { - b.append("x", y); - } - keyTest(b.obj(), true); - } - { - double nan = numeric_limits<double>::quiet_NaN(); - BSONObj o = BSON("y" << nan); - keyTest(o); - } - - { BSONObjBuilder b; b.append("", "c"); b.appendNull(""); BSONObj o = b.obj(); - keyTest(o); ASSERT(dps::compareObjectsAccordingToSort(o, BSON("" << "b" @@ -557,13 +478,6 @@ public: ASSERT(BSON("a" << nan).woCompare(BSON("a" << 5000000000LL)) < 0); - { - KeyV1Owned a(BSON("a" << nan)); - KeyV1Owned b(BSON("a" << 1)); - Ordering o = Ordering::make(BSON("a" << 1)); - ASSERT(a.woCompare(b, o) < 0); - } - ASSERT(BSON("a" << 1).woCompare(BSON("a" << nan)) > 0); ASSERT(BSON("a" << nan2).woCompare(BSON("a" << nan2)) == 0); @@ -644,41 +558,6 @@ struct AppendIntOrLL { void run() { const long long billion = 1000 * 1000 * 1000; - long long n = 0x3333111122224444LL; - { - double d = (double)n; - BSONObj a = BSON("x" << n); - BSONObj b = BSON("x" << d); - - long long back = (long long)d; - // 3719 - ////// int res = a.woCompare(b); - - ASSERT(n > back); - // ASSERT( res > 0 ); // SERVER-3719 - - keyTest(a, false); - - KeyV1Owned A(a); - KeyV1Owned B(b); - // 3719 - ////// int res2 = A.woCompare(B, Ordering::make(BSONObj())); - // ASSERT( res2 > 0 ); // SERVER-3719 - - // fixing requires an index v# change. 
- - cout << "todo fix SERVER-3719 and uncomment test in AppendIntOrLL" << endl; - - n++; - } - - { - BSONObjBuilder b; - b.appendIntOrLL("L4", -4 * billion); - keyTest(b.obj()); - keyTest(BSON("" << billion)); - } - BSONObjBuilder b; b.appendIntOrLL("i1", 1); b.appendIntOrLL("i2", -1); @@ -693,7 +572,6 @@ struct AppendIntOrLL { b.appendIntOrLL("L6", -16 * billion); BSONObj o = b.obj(); - keyTest(o); ASSERT(o["i1"].type() == NumberInt); ASSERT(o["i1"].number() == 1); @@ -730,7 +608,6 @@ struct AppendNumber { b.appendNumber("f", mongo::Decimal128("1")); BSONObj o = b.obj(); - keyTest(o); ASSERT(o["a"].type() == NumberInt); ASSERT(o["b"].type() == NumberDouble); @@ -741,161 +618,6 @@ struct AppendNumber { } }; -class ToStringArray { -public: - void run() { - string spec = "{ a: [ \"a\", \"b\" ] }"; - ASSERT_EQUALS(spec, fromjson(spec).toString()); - - BSONObj x = BSON("a" - << "astring" - << "b" - << "str"); - keyTest(x); - keyTest(x); - BSONObj y = BSON("a" - << "astring" - << "b" - << "stra"); - keyTest(y); - y = BSON("a" - << ""); - keyTest(y); - - keyTest(BSON("abc" << true)); - keyTest(BSON("abc" << false)); - keyTest(BSON("abc" << false << "b" << true)); - - Date_t now = jsTime(); - keyTest(BSON("" << now << "" << 3 << "" << jstNULL << "" << true)); - keyTest(BSON("" << now << "" << 3 << "" << BSONObj() << "" << true)); - - {{// check signed dates with new key format - KeyV1Owned a(BSONObjBuilder().appendDate("", Date_t::fromMillisSinceEpoch(-50)).obj()); - KeyV1Owned b(BSONObjBuilder().appendDate("", Date_t::fromMillisSinceEpoch(50)).obj()); - ASSERT(a.woCompare(b, Ordering::make(BSONObj())) < 0); - } - { - // backward compatibility - KeyBson a(BSONObjBuilder().appendDate("", Date_t::fromMillisSinceEpoch(-50)).obj()); - KeyBson b(BSONObjBuilder().appendDate("", Date_t::fromMillisSinceEpoch(50)).obj()); - ASSERT(a.woCompare(b, Ordering::make(BSONObj())) > 0); - } - { - // this is an uncompactable key: - BSONObj uc1 = BSONObjBuilder() - .appendDate("", Date_t::fromMillisSinceEpoch(-50)) - .appendCode("", "abc") - .obj(); - BSONObj uc2 = BSONObjBuilder() - .appendDate("", Date_t::fromMillisSinceEpoch(55)) - .appendCode("", "abc") - .obj(); - ASSERT(uc1.woCompare(uc2, Ordering::make(BSONObj())) < 0); - { - KeyV1Owned a(uc1); - KeyV1Owned b(uc2); - ASSERT(!a.isCompactFormat()); - ASSERT(a.woCompare(b, Ordering::make(BSONObj())) < 0); - } - { - KeyBson a(uc1); - KeyBson b(uc2); - ASSERT(!a.isCompactFormat()); - ASSERT(a.woCompare(b, Ordering::make(BSONObj())) > 0); - } - } -} - -{ - BSONObjBuilder b; - b.appendBinData("f", 8, (BinDataType)1, "aaaabbbb"); - b.appendBinData("e", 3, (BinDataType)1, "aaa"); - b.appendBinData("b", 1, (BinDataType)1, "x"); - BSONObj o = b.obj(); - keyTest(o, true); -} - -{ - // check (non)equality - BSONObj a = BSONObjBuilder().appendBinData("", 8, (BinDataType)1, "abcdefgh").obj(); - BSONObj b = BSONObjBuilder().appendBinData("", 8, (BinDataType)1, "abcdefgj").obj(); - ASSERT_BSONOBJ_NE(a, b); - int res_ab = a.woCompare(b); - ASSERT(res_ab != 0); - keyTest(a, true); - keyTest(b, true); - - // check subtypes do not equal - BSONObj c = BSONObjBuilder().appendBinData("", 8, (BinDataType)4, "abcdefgh").obj(); - BSONObj d = BSONObjBuilder().appendBinData("", 8, (BinDataType)0x81, "abcdefgh").obj(); - ASSERT_BSONOBJ_NE(a, c); - int res_ac = a.woCompare(c); - ASSERT(res_ac != 0); - keyTest(c, true); - ASSERT_BSONOBJ_NE(a, d); - int res_ad = a.woCompare(d); - ASSERT(res_ad != 0); - keyTest(d, true); - - KeyV1Owned A(a); - KeyV1Owned B(b); - KeyV1Owned C(c); - KeyV1Owned 
D(d); - ASSERT(!A.woEqual(B)); - ASSERT(A.woCompare(B, Ordering::make(BSONObj())) < 0 && res_ab < 0); - ASSERT(!A.woEqual(C)); - ASSERT(A.woCompare(C, Ordering::make(BSONObj())) < 0 && res_ac < 0); - ASSERT(!A.woEqual(D)); - ASSERT(A.woCompare(D, Ordering::make(BSONObj())) < 0 && res_ad < 0); -} - -{ - BSONObjBuilder b; - b.appendBinData("f", 33, (BinDataType)1, "123456789012345678901234567890123"); - BSONObj o = b.obj(); - keyTest(o, false); -} - -{ - for (int i = 1; i <= 3; i++) { - for (int j = 1; j <= 3; j++) { - BSONObjBuilder b; - b.appendBinData("f", i, (BinDataType)j, "abc"); - BSONObj o = b.obj(); - keyTest(o, j != ByteArrayDeprecated); - } - } -} - -{ - BSONObjBuilder b; - b.appendBinData("f", 1, (BinDataType)133, "a"); - BSONObj o = b.obj(); - keyTest(o, true); -} - -{ - BSONObjBuilder b; - b.append("AA", 3); - b.appendBinData("f", 0, (BinDataType)0, ""); - b.appendBinData("e", 3, (BinDataType)7, "aaa"); - b.appendBinData("b", 1, (BinDataType)128, "x"); - b.append("z", 3); - b.appendBinData("bb", 0, (BinDataType)129, "x"); - BSONObj o = b.obj(); - keyTest(o, true); -} - -{ - // 9 is not supported in compact format. so test a non-compact case here. - BSONObjBuilder b; - b.appendBinData("f", 9, (BinDataType)0, "aaaabbbbc"); - BSONObj o = b.obj(); - keyTest(o); -} -} -}; class ToStringNumber { public: @@ -915,7 +637,6 @@ public: b.append("i", -0.0); BSONObj x = b.obj(); - keyTest(x); ASSERT_EQUALS("4", x["a"].toString(false, true)); ASSERT_EQUALS("5.0", x["b"].toString(false, true)); @@ -973,7 +694,6 @@ public: b.append("b", z); b.appendAs(b.asTempObj()["a"], "c"); BSONObj o = b.obj(); - keyTest(o); stringstream ss; ss << 'a' << '\0' << 'b'; @@ -1352,8 +1072,6 @@ public: b.appendOID("b", 0, false); b.appendOID("c", 0, true); BSONObj o = b.obj(); - keyTest(o); - ASSERT(o["a"].__oid().toString() == "000000000000000000000000"); ASSERT(o["b"].__oid().toString() == "000000000000000000000000"); ASSERT(o["c"].__oid().toString() != "000000000000000000000000"); @@ -1951,8 +1669,6 @@ struct BSONArrayBuilderTest { BSONObj o = BSON("obj" << obj << "arr" << arr << "arr2" << BSONArray(obj) << "regex" << BSONRegEx("reg", "x")); - keyTest(o); - ASSERT_EQUALS(o["obj"].type(), Object); ASSERT_EQUALS(o["arr"].type(), Array); ASSERT_EQUALS(o["arr2"].type(), Array); @@ -2209,8 +1925,6 @@ public: void run() { BSONObj x = BSON("a" << BSON("b" << 1)); BSONObj y = BSON("a" << BSON("b" << 1.0)); - keyTest(x); - keyTest(y); ASSERT_BSONOBJ_EQ(x, y); ASSERT_EQUALS(0, x.woCompare(y)); } @@ -2325,7 +2039,6 @@ public: add<BSONObjTests::AsTempObj>(); add<BSONObjTests::AppendIntOrLL>(); add<BSONObjTests::AppendNumber>(); - add<BSONObjTests::ToStringArray>(); add<BSONObjTests::ToStringNumber>(); add<BSONObjTests::AppendAs>(); add<BSONObjTests::ToStringRecursionDepth>(); diff --git a/src/mongo/dbtests/mmaptests.cpp b/src/mongo/dbtests/mmaptests.cpp deleted file mode 100644 index bec7f072342..00000000000 --- a/src/mongo/dbtests/mmaptests.cpp +++ /dev/null @@ -1,312 +0,0 @@ -// @file mmaptests.cpp - -/** - * Copyright (C) 2008 10gen Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#include "mongo/platform/basic.h" - -#include <boost/filesystem/operations.hpp> -#include <iostream> - -#include "mongo/db/concurrency/d_concurrency.h" -#include "mongo/db/concurrency/lock_state.h" -#include "mongo/db/service_context.h" -#include "mongo/db/storage/mmap_v1/data_file.h" -#include "mongo/db/storage/mmap_v1/durable_mapped_file.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_options.h" -#include "mongo/db/storage/storage_options.h" -#include "mongo/dbtests/dbtests.h" -#include "mongo/util/scopeguard.h" -#include "mongo/util/timer.h" - -namespace MMapTests { - -using std::endl; -using std::string; - -class LeakTest { - const string fn; - const int optOld; - -public: - LeakTest() - : fn((boost::filesystem::path(storageGlobalParams.dbpath) / "testfile.map").string()), - optOld(mmapv1GlobalOptions.journalOptions) { - mmapv1GlobalOptions.journalOptions = 0; // DurParanoid doesn't make sense with this test - } - ~LeakTest() { - mmapv1GlobalOptions.journalOptions = optOld; - try { - boost::filesystem::remove(fn); - } catch (...) { - } - } - void run() { - try { - boost::filesystem::remove(fn); - } catch (...) { - } - - auto opCtx = cc().makeOperationContext(); - Lock::GlobalWrite lk(opCtx.get()); - - { - DurableMappedFile f(opCtx.get()); - ON_BLOCK_EXIT([&f, &opCtx] { - LockMongoFilesExclusive lock(opCtx.get()); - f.close(opCtx.get()); - }); - unsigned long long len = 256 * 1024 * 1024; - verify(f.create(opCtx.get(), fn, len)); - { - char* p = (char*)f.getView(); - verify(p); - // write something to the private view as a test - if (storageGlobalParams.dur) - privateViews.makeWritable(p, 6); - strcpy(p, "hello"); - } - if (storageGlobalParams.dur) { - char* w = (char*)f.view_write(); - strcpy(w + 6, "world"); - } - MongoFileFinder ff(opCtx.get()); - ASSERT(ff.findByPath(fn)); - ASSERT(ff.findByPath("asdf") == 0); - } - { - MongoFileFinder ff(opCtx.get()); - ASSERT(ff.findByPath(fn) == 0); - } - - int N = 10000; -#if !defined(_WIN32) && !defined(__linux__) - // seems this test is slow on OS X. - N = 100; -#endif - - // we make a lot here -- if we were leaking, presumably it would fail doing this many. - Timer t; - for (int i = 0; i < N; i++) { - // Every 4 iterations we pass the sequential hint. - DurableMappedFile f{opCtx.get(), - i % 4 == 1 ? 
MongoFile::Options::SEQUENTIAL - : MongoFile::Options::NONE}; - ON_BLOCK_EXIT([&f, &opCtx] { - LockMongoFilesExclusive lock(opCtx.get()); - f.close(opCtx.get()); - }); - verify(f.open(opCtx.get(), fn)); - { - char* p = (char*)f.getView(); - verify(p); - if (storageGlobalParams.dur) - privateViews.makeWritable(p, 4); - strcpy(p, "zzz"); - } - if (storageGlobalParams.dur) { - char* w = (char*)f.view_write(); - if (i % 2 == 0) - ++(*w); - verify(w[6] == 'w'); - } - } - if (t.millis() > 10000) { - mongo::unittest::log() << "warning: MMap LeakTest is unusually slow N:" << N << ' ' - << t.millis() << "ms" << endl; - } - } -}; - -class ExtentSizing { -public: - void run() { - MmapV1ExtentManager em("x", "x", false); - - ASSERT_EQUALS(em.maxSize(), em.quantizeExtentSize(em.maxSize())); - - // test that no matter what we start with, we always get to max extent size - for (int obj = 16; obj < BSONObjMaxUserSize; obj += 111) { - int sz = em.initialSize(obj); - - double totalExtentSize = sz; - - int numFiles = 1; - int sizeLeftInExtent = em.maxSize() - 1; - - for (int i = 0; i < 100; i++) { - sz = em.followupSize(obj, sz); - ASSERT(sz >= obj); - ASSERT(sz >= em.minSize()); - ASSERT(sz <= em.maxSize()); - ASSERT(sz <= em.maxSize()); - - totalExtentSize += sz; - - if (sz < sizeLeftInExtent) { - sizeLeftInExtent -= sz; - } else { - numFiles++; - sizeLeftInExtent = em.maxSize() - sz; - } - } - ASSERT_EQUALS(em.maxSize(), sz); - - double allocatedOnDisk = (double)numFiles * em.maxSize(); - - ASSERT((totalExtentSize / allocatedOnDisk) > .95); - - invariant(em.numFiles() == 0); - } - } -}; - -class All : public Suite { -public: - All() : Suite("mmap") {} - void setupTests() { - if (!getGlobalServiceContext()->getStorageEngine()->isMmapV1()) - return; - - add<LeakTest>(); - add<ExtentSizing>(); - } -}; - -SuiteInstance<All> myall; - -#if 0 - - class CopyOnWriteSpeedTest { - public: - void run() { - - string fn = "/tmp/testfile.map"; - boost::filesystem::remove(fn); - - MemoryMappedFile f; - char *p = (char *) f.create(fn, 1024 * 1024 * 1024, true); - verify(p); - strcpy(p, "hello"); - - { - void *x = f.testGetCopyOnWriteView(); - Timer tt; - for( int i = 11; i < 1000000000; i++ ) - p[i] = 'z'; - cout << "fill 1GB time: " << tt.millis() << "ms" << endl; - f.testCloseCopyOnWriteView(x); - } - - /* test a lot of view/unviews */ - { - Timer t; - - char *q; - for( int i = 0; i < 1000; i++ ) { - q = (char *) f.testGetCopyOnWriteView(); - verify( q ); - if( i == 999 ) { - strcpy(q+2, "there"); - } - f.testCloseCopyOnWriteView(q); - } - - cout << "view unview: " << t.millis() << "ms" << endl; - } - - f.flush(true); - - /* plain old mmaped writes */ - { - Timer t; - for( int i = 0; i < 10; i++ ) { - memset(p+100, 'c', 200 * 1024 * 1024); - } - cout << "traditional writes: " << t.millis() << "ms" << endl; - } - - f.flush(true); - - /* test doing some writes */ - { - Timer t; - char *q = (char *) f.testGetCopyOnWriteView(); - for( int i = 0; i < 10; i++ ) { - verify( q ); - memset(q+100, 'c', 200 * 1024 * 1024); - } - f.testCloseCopyOnWriteView(q); - - cout << "inc style some writes: " << t.millis() << "ms" << endl; - } - - /* test doing some writes */ - { - Timer t; - for( int i = 0; i < 10; i++ ) { - char *q = (char *) f.testGetCopyOnWriteView(); - verify( q ); - memset(q+100, 'c', 200 * 1024 * 1024); - f.testCloseCopyOnWriteView(q); - } - - cout << "some writes: " << t.millis() << "ms" << endl; - } - - /* more granular */ - { - Timer t; - for( int i = 0; i < 100; i++ ) { - char *q = (char *) 
f.testGetCopyOnWriteView(); - verify( q ); - memset(q+100, 'c', 20 * 1024 * 1024); - f.testCloseCopyOnWriteView(q); - } - - cout << "more granular some writes: " << t.millis() << "ms" << endl; - } - - p[10] = 0; - cout << p << endl; - } - }; - - class All : public Suite { - public: - All() : Suite( "mmap" ) {} - void setupTests() { - add< CopyOnWriteSpeedTest >(); - } - } myall; - -#endif -} // namespace MMapTests diff --git a/src/mongo/dbtests/namespacetests.cpp b/src/mongo/dbtests/namespacetests.cpp deleted file mode 100644 index 2d1144c327c..00000000000 --- a/src/mongo/dbtests/namespacetests.cpp +++ /dev/null @@ -1,664 +0,0 @@ -/** - * Copyright (C) 2008-2014 MongoDB Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License, version 3, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * As a special exception, the copyright holders give permission to link the - * code of portions of this program with the OpenSSL library under certain - * conditions as described in each individual source file and distribute - * linked combinations including the program with the OpenSSL library. You - * must comply with the GNU Affero General Public License in all respects - * for all of the code used other than as permitted herein. If you modify - * file(s) with this exception, you may extend this exception to your - * version of the file(s), but you are not obligated to do so. If you do not - * wish to do so, delete this exception statement from your version. If you - * delete this exception statement from all source files in the program, - * then also delete it in the license file. - */ - -#define MONGO_LOG_DEFAULT_COMPONENT ::mongo::logger::LogComponent::kDefault - -#include "mongo/platform/basic.h" - -#include <string> - -#include "mongo/bson/simple_bsonobj_comparator.h" -#include "mongo/db/catalog/collection.h" -#include "mongo/db/catalog/database_holder.h" -#include "mongo/db/client.h" -#include "mongo/db/db_raii.h" -#include "mongo/db/index/expression_keys_private.h" -#include "mongo/db/index_legacy.h" -#include "mongo/db/index_names.h" -#include "mongo/db/json.h" -#include "mongo/db/query/internal_plans.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details.h" -#include "mongo/db/storage/mmap_v1/catalog/namespace_details_rsv1_metadata.h" -#include "mongo/db/storage/mmap_v1/extent.h" -#include "mongo/db/storage/mmap_v1/extent_manager.h" -#include "mongo/db/storage/mmap_v1/mmap_v1_extent_manager.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_capped.h" -#include "mongo/db/storage/mmap_v1/record_store_v1_simple.h" -#include "mongo/db/storage/storage_engine.h" -#include "mongo/dbtests/dbtests.h" -#include "mongo/util/log.h" - -namespace NamespaceTests { - -using std::string; - -const int MinExtentSize = 4096; - -namespace MissingFieldTests { - -/** A missing field is represented as null in a btree index. 
*/ -class BtreeIndexMissingField { -public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - BSONObj spec(BSON("key" << BSON("a" << 1))); - ASSERT_EQUALS(jstNULL, - IndexLegacy::getMissingField(&opCtx, NULL, spec).firstElement().type()); - } -}; - -/** A missing field is represented as null in a 2d index. */ -class TwoDIndexMissingField { -public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - BSONObj spec(BSON("key" << BSON("a" - << "2d"))); - ASSERT_EQUALS(jstNULL, - IndexLegacy::getMissingField(&opCtx, NULL, spec).firstElement().type()); - } -}; - -/** A missing field is represented with the hash of null in a hashed index. */ -class HashedIndexMissingField { -public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - BSONObj spec(BSON("key" << BSON("a" - << "hashed"))); - BSONObj nullObj = BSON("a" << BSONNULL); - - // Call getKeys on the nullObj. - BSONObjSet nullFieldKeySet = SimpleBSONObjComparator::kInstance.makeBSONObjSet(); - const CollatorInterface* collator = nullptr; - ExpressionKeysPrivate::getHashKeys(nullObj, "a", 0, 0, false, collator, &nullFieldKeySet); - BSONElement nullFieldFromKey = nullFieldKeySet.begin()->firstElement(); - - ASSERT_EQUALS(ExpressionKeysPrivate::makeSingleHashKey(nullObj.firstElement(), 0, 0), - nullFieldFromKey.Long()); - - BSONObj missingField = IndexLegacy::getMissingField(&opCtx, NULL, spec); - ASSERT_EQUALS(NumberLong, missingField.firstElement().type()); - ASSERT_BSONELT_EQ(nullFieldFromKey, missingField.firstElement()); - } -}; - -/** - * A missing field is represented with the hash of null in a hashed index. This hash value - * depends on the hash seed. - */ -class HashedIndexMissingFieldAlternateSeed { -public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - BSONObj spec(BSON("key" << BSON("a" - << "hashed") - << "seed" - << 0x5eed)); - BSONObj nullObj = BSON("a" << BSONNULL); - - BSONObjSet nullFieldKeySet = SimpleBSONObjComparator::kInstance.makeBSONObjSet(); - const CollatorInterface* collator = nullptr; - ExpressionKeysPrivate::getHashKeys( - nullObj, "a", 0x5eed, 0, false, collator, &nullFieldKeySet); - BSONElement nullFieldFromKey = nullFieldKeySet.begin()->firstElement(); - - ASSERT_EQUALS(ExpressionKeysPrivate::makeSingleHashKey(nullObj.firstElement(), 0x5eed, 0), - nullFieldFromKey.Long()); - - // Ensure that getMissingField recognizes that the seed is different (and returns - // the right key). 
- BSONObj missingField = IndexLegacy::getMissingField(&opCtx, NULL, spec); - ASSERT_EQUALS(NumberLong, missingField.firstElement().type()); - ASSERT_BSONELT_EQ(nullFieldFromKey, missingField.firstElement()); - } -}; - -} // namespace MissingFieldTests - -namespace NamespaceDetailsTests { -#if 0 // SERVER-13640 - - class Base { - const char *ns_; - Lock::GlobalWrite lk; - OldClientContext _context; - public: - Base( const char *ns = "unittests.NamespaceDetailsTests" ) : ns_( ns ) , _context( ns ) {} - virtual ~Base() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - if ( !nsd() ) - return; - _context.db()->dropCollection( &opCtx, ns() ); - } - protected: - void create() { - Lock::GlobalWrite lk; - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - - CollectionOptions collectionOptions; - ASSERT_OK(collectionOptions.parse(fromjson(spec()), - CollectionOptions::parseForCommand)); - ASSERT_OK(userCreateNS(&opCtx, db(), ns(), collectionOptions, false)); - } - virtual string spec() const = 0; - int nRecords() const { - int count = 0; - const Extent* ext; - for ( RecordId extLoc = nsd()->firstExtent(); - !extLoc.isNull(); - extLoc = ext->xnext) { - ext = extentManager()->getExtent(extLoc); - int fileNo = ext->firstRecord.a(); - if ( fileNo == -1 ) - continue; - for ( int recOfs = ext->firstRecord.getOfs(); recOfs != RecordId::NullOfs; - recOfs = recordStore()->recordFor(RecordId(fileNo, recOfs))->nextOfs() ) { - ++count; - } - } - ASSERT_EQUALS( count, nsd()->numRecords() ); - return count; - } - int nExtents() const { - int count = 0; - for ( RecordId extLoc = nsd()->firstExtent(); - !extLoc.isNull(); - extLoc = extentManager()->getExtent(extLoc)->xnext ) { - ++count; - } - return count; - } - const char *ns() const { - return ns_; - } - const NamespaceDetails *nsd() const { - Collection* c = collection(); - if ( !c ) - return NULL; - return c->detailsDeprecated(); - } - const RecordStore* recordStore() const { - Collection* c = collection(); - if ( !c ) - return NULL; - return c->getRecordStore(); - } - Database* db() const { - return _context.db(); - } - const ExtentManager* extentManager() const { - return db()->getExtentManager(); - } - Collection* collection() const { - return db()->getCollection( &opCtx, ns() ); - } - - static BSONObj bigObj() { - BSONObjBuilder b; - b.appendOID("_id", 0, true); - string as( 187, 'a' ); - b.append( "a", as ); - return b.obj(); - } - - }; - - class Create : public Base { - public: - void run() { - create(); - ASSERT( nsd() ); - ASSERT_EQUALS( 0, nRecords() ); - ASSERT( nsd()->firstExtent() == nsd()->capExtent() ); - RecordId initial = RecordId(); - initial.setInvalid(); - ASSERT( initial == nsd()->capFirstNewRecord() ); - } - virtual string spec() const { return "{\"capped\":true,\"size\":512,\"$nExtents\":1}"; } - }; - - class SingleAlloc : public Base { - public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - create(); - BSONObj b = bigObj(); - ASSERT( collection()->insertDocument( &opCtx, b, true ).isOK() ); - ASSERT_EQUALS( 1, nRecords() ); - } - virtual string spec() const { return "{\"capped\":true,\"size\":512,\"$nExtents\":1}"; } - }; - - class Realloc : public Base { - public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = 
*opCtxPtr; - create(); - - const int N = 20; - const int Q = 16; // these constants depend on the size of the bson object, the extent - // size allocated by the system too - RecordId l[ N ]; - for ( int i = 0; i < N; ++i ) { - BSONObj b = bigObj(); - StatusWith<RecordId> status = - ASSERT( collection()->insertDocument( &opCtx, b, true ).isOK() ); - l[ i ] = status.getValue(); - ASSERT( !l[ i ].isNull() ); - ASSERT( nRecords() <= Q ); - //ASSERT_EQUALS( 1 + i % 2, nRecords() ); - if ( i >= 16 ) - ASSERT( l[ i ] == l[ i - Q] ); - } - } - virtual string spec() const { return "{\"capped\":true,\"size\":512,\"$nExtents\":1}"; } - }; - - class TwoExtent : public Base { - public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - create(); - ASSERT_EQUALS( 2, nExtents() ); - - RecordId l[ 8 ]; - for ( int i = 0; i < 8; ++i ) { - StatusWith<RecordId> status = - ASSERT( collection()->insertDocument( &opCtx, bigObj(), true ).isOK() ); - l[ i ] = status.getValue(); - ASSERT( !l[ i ].isNull() ); - //ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() ); - //if ( i > 3 ) - // ASSERT( l[ i ] == l[ i - 4 ] ); - } - ASSERT( nRecords() == 8 ); - - // Too big - BSONObjBuilder bob; - bob.appendOID( "_id", NULL, true ); - bob.append( "a", string( MinExtentSize + 500, 'a' ) ); // min extent size is now 4096 - BSONObj bigger = bob.done(); - ASSERT( !collection()->insertDocument( &opCtx, bigger, false ).isOK() ); - ASSERT_EQUALS( 0, nRecords() ); - } - private: - virtual string spec() const { - return "{\"capped\":true,\"size\":512,\"$nExtents\":2}"; - } - }; - - - BSONObj docForRecordSize( int size ) { - BSONObjBuilder b; - b.append( "_id", 5 ); - b.append( "x", string( size - Record::HeaderSize - 22, 'x' ) ); - BSONObj x = b.obj(); - ASSERT_EQUALS( Record::HeaderSize + x.objsize(), size ); - return x; - } - - /** - * alloc() does not quantize records in capped collections. - * NB: this actually tests that the code in Database::createCollection doesn't set - * PowerOf2Sizes for capped collections. - */ - class AllocCappedNotQuantized : public Base { - public: - void run() { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - create(); - ASSERT( nsd()->isCapped() ); - ASSERT( !nsd()->isUserFlagSet( NamespaceDetails::Flag_UsePowerOf2Sizes ) ); - - StatusWith<RecordId> result = - collection()->insertDocument( &opCtx, docForRecordSize( 300 ), false ); - ASSERT( result.isOK() ); - Record* record = collection()->getRecordStore()->recordFor( result.getValue() ); - // Check that no quantization is performed. 
- ASSERT_EQUALS( 300, record->lengthWithHeaders() ); - } - virtual string spec() const { return "{capped:true,size:2048}"; } - }; - - - /* test NamespaceDetails::cappedTruncateAfter(const char *ns, RecordId loc) - */ - class TruncateCapped : public Base { - virtual string spec() const { - return "{\"capped\":true,\"size\":512,\"$nExtents\":2}"; - } - void pass(int p) { - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - create(); - ASSERT_EQUALS( 2, nExtents() ); - - BSONObj b = bigObj(); - - int N = MinExtentSize / b.objsize() * nExtents() + 5; - int T = N - 4; - - RecordId truncAt; - //RecordId l[ 8 ]; - for ( int i = 0; i < N; ++i ) { - BSONObj bb = bigObj(); - StatusWith<RecordId> status = collection()->insertDocument( &opCtx, bb, true ); - ASSERT( status.isOK() ); - RecordId a = status.getValue(); - if( T == i ) - truncAt = a; - ASSERT( !a.isNull() ); - /*ASSERT_EQUALS( i < 2 ? i + 1 : 3 + i % 2, nRecords() ); - if ( i > 3 ) - ASSERT( l[ i ] == l[ i - 4 ] );*/ - } - ASSERT( nRecords() < N ); - - RecordId last, first; - { - unique_ptr<Runner> runner(InternalPlanner::collectionScan(&opCtx, - ns(), - collection(), - InternalPlanner::BACKWARD)); - runner->getNext(NULL, &last); - ASSERT( !last.isNull() ); - } - { - unique_ptr<Runner> runner(InternalPlanner::collectionScan(&opCtx, - ns(), - collection(), - InternalPlanner::FORWARD)); - runner->getNext(NULL, &first); - ASSERT( !first.isNull() ); - ASSERT( first != last ) ; - } - - collection()->cappedTruncateAfter(&opCtx, truncAt, false); - ASSERT_EQUALS( collection()->numRecords() , 28u ); - - { - RecordId loc; - unique_ptr<Runner> runner(InternalPlanner::collectionScan(&opCtx, - ns(), - collection(), - InternalPlanner::FORWARD)); - runner->getNext(NULL, &loc); - ASSERT( first == loc); - } - { - unique_ptr<Runner> runner(InternalPlanner::collectionScan(&opCtx, - ns(), - collection(), - InternalPlanner::BACKWARD)); - RecordId loc; - runner->getNext(NULL, &loc); - ASSERT( last != loc ); - ASSERT( !last.isNull() ); - } - - // Too big - BSONObjBuilder bob; - bob.appendOID("_id", 0, true); - bob.append( "a", string( MinExtentSize + 300, 'a' ) ); - BSONObj bigger = bob.done(); - ASSERT( !collection()->insertDocument( &opCtx, bigger, true ).isOK() ); - ASSERT_EQUALS( 0, nRecords() ); - } - public: - void run() { -// log() << "******** NOT RUNNING TruncateCapped test yet ************" << endl; - pass(0); - } - }; -#endif // SERVER-13640 -#if 0 // XXXXXX - once RecordStore is clean, we can put this back - class Migrate : public Base { - public: - void run() { - create(); - nsd()->deletedListEntry( 2 ) = nsd()->cappedListOfAllDeletedRecords().drec()-> - nextDeleted().drec()->nextDeleted(); - nsd()->cappedListOfAllDeletedRecords().drec()->nextDeleted().drec()-> - nextDeleted().writing() = RecordId(); - nsd()->cappedLastDelRecLastExtent().Null(); - NamespaceDetails *d = nsd(); - - zero( &d->capExtent() ); - zero( &d->capFirstNewRecord() ); - - // this has a side effect of called NamespaceDetails::cappedCheckMigrate - db()->namespaceIndex().details( ns() ); - - ASSERT( nsd()->firstExtent() == nsd()->capExtent() ); - ASSERT( nsd()->capExtent().getOfs() != 0 ); - ASSERT( !nsd()->capFirstNewRecord().isValid() ); - int nDeleted = 0; - for ( RecordId i = nsd()->cappedListOfAllDeletedRecords(); - !i.isNull(); i = i.drec()->nextDeleted(), ++nDeleted ); - ASSERT_EQUALS( 10, nDeleted ); - ASSERT( nsd()->cappedLastDelRecLastExtent().isNull() ); - } - private: - static void zero( RecordId *d 
) { - memset( d, 0, sizeof( RecordId ) ); - } - virtual string spec() const { - return "{\"capped\":true,\"size\":512,\"$nExtents\":10}"; - } - }; -#endif - -// This isn't a particularly useful test, and because it doesn't clean up -// after itself, /tmp/unittest needs to be cleared after running. -// class BigCollection : public Base { -// public: -// BigCollection() : Base( "NamespaceDetailsTests_BigCollection" ) {} -// void run() { -// create(); -// ASSERT_EQUALS( 2, nExtents() ); -// } -// private: -// virtual string spec() const { -// // NOTE 256 added to size in _userCreateNS() -// long long big = DataFile::maxSize() - DataFileHeader::HeaderSize; -// stringstream ss; -// ss << "{\"capped\":true,\"size\":" << big << "}"; -// return ss.str(); -// } -// }; - -#if 0 // SERVER-13640 - class SwapIndexEntriesTest : public Base { - public: - void run() { - create(); - NamespaceDetails *nsd = collection()->detailsWritable(); - - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); OperationContext& opCtx = *opCtxPtr; - // Set 2 & 54 as multikey - nsd->setIndexIsMultikey(&opCtx, 2, true); - nsd->setIndexIsMultikey(&opCtx, 54, true); - ASSERT(nsd->isMultikey(2)); - ASSERT(nsd->isMultikey(54)); - - // Flip 2 & 47 - nsd->setIndexIsMultikey(&opCtx, 2, false); - nsd->setIndexIsMultikey(&opCtx, 47, true); - ASSERT(!nsd->isMultikey(2)); - ASSERT(nsd->isMultikey(47)); - - // Reset entries that are already true - nsd->setIndexIsMultikey(&opCtx, 54, true); - nsd->setIndexIsMultikey(&opCtx, 47, true); - ASSERT(nsd->isMultikey(54)); - ASSERT(nsd->isMultikey(47)); - - // Two non-multi-key - nsd->setIndexIsMultikey(&opCtx, 2, false); - nsd->setIndexIsMultikey(&opCtx, 43, false); - ASSERT(!nsd->isMultikey(2)); - ASSERT(nsd->isMultikey(54)); - ASSERT(nsd->isMultikey(47)); - ASSERT(!nsd->isMultikey(43)); - } - virtual string spec() const { return "{\"capped\":true,\"size\":512,\"$nExtents\":1}"; } - }; -#endif // SERVER-13640 -} // namespace NamespaceDetailsTests - -namespace DatabaseTests { - -class RollbackCreateCollection { -public: - void run() { - const string dbName = "rollback_create_collection"; - const string committedName = dbName + ".committed"; - const string rolledBackName = dbName + ".rolled_back"; - - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - - Lock::DBLock lk(&opCtx, dbName, MODE_X); - - bool justCreated; - Database* db = DatabaseHolder::getDatabaseHolder().openDb(&opCtx, dbName, &justCreated); - ASSERT(justCreated); - - Collection* committedColl; - { - WriteUnitOfWork wunit(&opCtx); - ASSERT_FALSE(db->getCollection(&opCtx, committedName)); - committedColl = db->createCollection(&opCtx, committedName); - ASSERT_EQUALS(db->getCollection(&opCtx, committedName), committedColl); - wunit.commit(); - } - - ASSERT_EQUALS(db->getCollection(&opCtx, committedName), committedColl); - - { - WriteUnitOfWork wunit(&opCtx); - ASSERT_FALSE(db->getCollection(&opCtx, rolledBackName)); - Collection* rolledBackColl = db->createCollection(&opCtx, rolledBackName); - ASSERT_EQUALS(db->getCollection(&opCtx, rolledBackName), rolledBackColl); - // not committing so creation should be rolled back - } - - // The rolledBackCollection creation should have been rolled back - ASSERT_FALSE(db->getCollection(&opCtx, rolledBackName)); - - // The committedCollection should not have been affected by the rollback. Holders - // of the original Collection pointer should still be valid. 
- ASSERT_EQUALS(db->getCollection(&opCtx, committedName), committedColl); - } -}; - -class RollbackDropCollection { -public: - void run() { - const string dbName = "rollback_drop_collection"; - const string droppedName = dbName + ".dropped"; - const string rolledBackName = dbName + ".rolled_back"; - - const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext(); - OperationContext& opCtx = *opCtxPtr; - - Lock::DBLock lk(&opCtx, dbName, MODE_X); - - bool justCreated; - Database* db = DatabaseHolder::getDatabaseHolder().openDb(&opCtx, dbName, &justCreated); - ASSERT(justCreated); - - { - WriteUnitOfWork wunit(&opCtx); - ASSERT_FALSE(db->getCollection(&opCtx, droppedName)); - Collection* droppedColl; - droppedColl = db->createCollection(&opCtx, droppedName); - ASSERT_EQUALS(db->getCollection(&opCtx, droppedName), droppedColl); - db->dropCollection(&opCtx, droppedName).transitional_ignore(); - wunit.commit(); - } - - // Should have been really dropped - ASSERT_FALSE(db->getCollection(&opCtx, droppedName)); - - { - WriteUnitOfWork wunit(&opCtx); - ASSERT_FALSE(db->getCollection(&opCtx, rolledBackName)); - Collection* rolledBackColl = db->createCollection(&opCtx, rolledBackName); - wunit.commit(); - ASSERT_EQUALS(db->getCollection(&opCtx, rolledBackName), rolledBackColl); - db->dropCollection(&opCtx, rolledBackName).transitional_ignore(); - // not committing so dropping should be rolled back - } - - // The rolledBackCollection dropping should have been rolled back. - // Original Collection pointers are no longer valid. - ASSERT(db->getCollection(&opCtx, rolledBackName)); - - // The droppedCollection should not have been restored by the rollback. - ASSERT_FALSE(db->getCollection(&opCtx, droppedName)); - } -}; -} // namespace DatabaseTests - -class All : public Suite { -public: - All() : Suite("namespace") {} - - void setupTests() { - add<MissingFieldTests::BtreeIndexMissingField>(); - add<MissingFieldTests::TwoDIndexMissingField>(); - add<MissingFieldTests::HashedIndexMissingField>(); - add<MissingFieldTests::HashedIndexMissingFieldAlternateSeed>(); - -// add< NamespaceDetailsTests::Create >(); -// add< NamespaceDetailsTests::SingleAlloc >(); -// add< NamespaceDetailsTests::Realloc >(); -// add< NamespaceDetailsTests::AllocCappedNotQuantized >(); -// add< NamespaceDetailsTests::TwoExtent >(); -// add< NamespaceDetailsTests::TruncateCapped >(); -// add< NamespaceDetailsTests::Migrate >(); -// add< NamespaceDetailsTests::SwapIndexEntriesTest >(); -// add< NamespaceDetailsTests::BigCollection >(); - -#if 0 - // until ROLLBACK_ENABLED - add< DatabaseTests::RollbackCreateCollection >(); - add< DatabaseTests::RollbackDropCollection >(); -#endif - } -}; - -SuiteInstance<All> myall; - -} // namespace NamespaceTests diff --git a/src/mongo/embedded/embedded.cpp b/src/mongo/embedded/embedded.cpp index a7c5c6c719d..aa4b4133c5e 100644 --- a/src/mongo/embedded/embedded.cpp +++ b/src/mongo/embedded/embedded.cpp @@ -231,12 +231,6 @@ ServiceContext* initialize(const char* yaml_config) { uassert(50677, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath)); } - { - std::stringstream ss; - ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist"; - uassert(50678, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath)); - } - if (!storageGlobalParams.readOnly) { boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/"); } diff --git a/src/mongo/embedded/embedded_options.cpp b/src/mongo/embedded/embedded_options.cpp index 
524a751a5ee..b1e8792c346 100644 --- a/src/mongo/embedded/embedded_options.cpp +++ b/src/mongo/embedded/embedded_options.cpp @@ -77,11 +77,6 @@ Status addOptions(optionenvironment::OptionSection* options) { #endif - storage_options.addOptionChaining("storage.repairPath", - "repairpath", - optionenvironment::String, - "root directory for repair files - defaults to dbpath"); - options->addSection(general_options).transitional_ignore(); options->addSection(storage_options).transitional_ignore(); @@ -131,23 +126,6 @@ Status storeOptions(const moe::Environment& params) { } #endif - // needs to be after things like --configsvr parsing, thus here. - if (params.count("storage.repairPath")) { - storageGlobalParams.repairpath = params["storage.repairPath"].as<string>(); - if (!storageGlobalParams.repairpath.size()) { - return Status(ErrorCodes::BadValue, "repairpath is empty"); - } - - if (storageGlobalParams.dur && - !str::startsWith(storageGlobalParams.repairpath, storageGlobalParams.dbpath)) { - return Status(ErrorCodes::BadValue, - "You must use a --repairpath that is a subdirectory of --dbpath when " - "using journaling"); - } - } else { - storageGlobalParams.repairpath = storageGlobalParams.dbpath; - } - return Status::OK(); }
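
A note on the AppendIntOrLL hunk above: the deletions only drop the keyTest() round-trip checks tied to the removed v1 index-key code; the retained assertions still pin down how appendIntOrLL picks a BSON type. A minimal standalone sketch of that selection rule, assuming only the standard library (chooseBsonIntType is a hypothetical name, not a MongoDB API):

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Values that fit in 32 bits are appended as NumberInt; anything
    // larger (e.g. -16 * 10^9 from the test) becomes NumberLong.
    const char* chooseBsonIntType(int64_t n) {
        if (n >= std::numeric_limits<int32_t>::min() &&
            n <= std::numeric_limits<int32_t>::max())
            return "NumberInt";
        return "NumberLong";
    }

    int main() {
        const int64_t billion = 1000 * 1000 * 1000;
        std::cout << chooseBsonIntType(1) << '\n';              // NumberInt
        std::cout << chooseBsonIntType(-16 * billion) << '\n';  // NumberLong
        return 0;
    }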
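
The deleted ToStringArray block was the only place in that hunk asserting index-key ordering directly: KeyV1Owned put a date 50 ms before the epoch ahead of one 50 ms after it, while the backward-compatible KeyBson ordering reversed the two. A plausible reading, sketched with plain integers, is that the old format effectively compared the millisecond count as an unsigned quantity while v1 compares it as signed; this illustrates that difference and is not the removed key code itself:

    #include <cstdint>
    #include <iostream>

    int main() {
        const int64_t before = -50;  // 50 ms before the Unix epoch
        const int64_t after = 50;    // 50 ms after it

        // Signed comparison (the ordering the KeyV1Owned assertion expects).
        std::cout << (before < after) << '\n';  // 1: -50 sorts first

        // Unsigned comparison (matching the reversed KeyBson expectation):
        // -50 wraps to a huge value and sorts last.
        std::cout << (static_cast<uint64_t>(before) <
                      static_cast<uint64_t>(after)) << '\n';  // 0
        return 0;
    }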
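
The final embedded_options.cpp hunk removes the last consumer of storage.repairPath: an empty value was rejected, journaling required the repair path to live under the dbpath, and the setting otherwise defaulted to the dbpath itself. A self-contained restatement of that deleted validation, with hypothetical Status and parameter types standing in for the option-environment machinery:

    #include <optional>
    #include <string>

    struct Status {
        bool ok;
        std::string reason;
        static Status OK() { return {true, ""}; }
    };

    // 'dur' mirrors storageGlobalParams.dur (journaling enabled); the
    // prefix test mirrors str::startsWith from the deleted code.
    Status storeRepairPath(const std::optional<std::string>& repairPathParam,
                           const std::string& dbpath,
                           bool dur,
                           std::string& repairpath) {
        if (repairPathParam) {
            repairpath = *repairPathParam;
            if (repairpath.empty())
                return {false, "repairpath is empty"};
            if (dur && repairpath.rfind(dbpath, 0) != 0)
                return {false,
                        "You must use a --repairpath that is a subdirectory of "
                        "--dbpath when using journaling"};
        } else {
            repairpath = dbpath;  // default: repair in place under the dbpath
        }
        return Status::OK();
    }

With MMAPv1 gone there is no separate repair location left to configure, which is why the branch is deleted outright, together with the matching repairpath existence check removed from embedded.cpp above, rather than rewritten.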