diff options
author | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2021-01-08 13:30:33 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-01-08 19:56:20 +0000 |
commit | ca040c1f469aa0ffd68e7a0605c10145e7fb65dc (patch) | |
tree | 292dbc3aa4f45741cc2b5de76741d5766e02c536 /src/mongo/db | |
parent | dae67dbc6db48733d8a4a6d50f07ef469735807c (diff) | |
download | mongo-ca040c1f469aa0ffd68e7a0605c10145e7fb65dc.tar.gz |
SERVER-46678: Utilize durable history across restarts.
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/catalog/index_catalog_impl.cpp | 13 | ||||
-rw-r--r-- | src/mongo/db/storage/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/db/storage/devnull/devnull_kv_engine.h | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h | 4 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp | 8 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_engine.h | 4 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine_impl.cpp | 57 | ||||
-rw-r--r-- | src/mongo/db/storage/storage_engine_impl.h | 3 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp | 66 |
9 files changed, 150 insertions, 14 deletions
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp index a84eb7f995d..3f7eb16770d 100644 --- a/src/mongo/db/catalog/index_catalog_impl.cpp +++ b/src/mongo/db/catalog/index_catalog_impl.cpp @@ -115,6 +115,12 @@ Status IndexCatalogImpl::init(OperationContext* opCtx) { const bool replSetMemberInStandaloneMode = getReplSetMemberInStandaloneMode(opCtx->getServiceContext()); + boost::optional<Timestamp> recoveryTs = boost::none; + if (auto storageEngine = opCtx->getServiceContext()->getStorageEngine(); + storageEngine->supportsRecoveryTimestamp()) { + recoveryTs = storageEngine->getRecoveryTimestamp(); + } + for (size_t i = 0; i < indexNames.size(); i++) { const string& indexName = indexNames[i]; BSONObj spec = @@ -162,6 +168,13 @@ Status IndexCatalogImpl::init(OperationContext* opCtx) { auto flags = CreateIndexEntryFlags::kInitFromDisk | CreateIndexEntryFlags::kIsReady; IndexCatalogEntry* entry = createIndexEntry(opCtx, std::move(descriptor), flags); fassert(17340, entry->isReady(opCtx)); + + // When initializing indexes from disk, we conservatively set the minimumVisibleSnapshot + // to non _id indexes to the recovery timestamp. The _id index is left visible. It's + // assumed if the collection is visible, it's _id is valid to be used. + if (recoveryTs && !entry->descriptor()->isIdIndex()) { + entry->setMinimumVisibleSnapshot(recoveryTs.get()); + } } } diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript index e138a247664..68a312f54b9 100644 --- a/src/mongo/db/storage/SConscript +++ b/src/mongo/db/storage/SConscript @@ -566,6 +566,7 @@ env.Library( LIBDEPS=[ '$BUILD_DIR/mongo/base', '$BUILD_DIR/mongo/db/catalog/catalog_control', + '$BUILD_DIR/mongo/db/catalog_raii', '$BUILD_DIR/mongo/db/server_options_core', '$BUILD_DIR/mongo/db/storage/durable_catalog_impl', '$BUILD_DIR/mongo/db/storage/kv/kv_drop_pending_ident_reaper', diff --git a/src/mongo/db/storage/devnull/devnull_kv_engine.h b/src/mongo/db/storage/devnull/devnull_kv_engine.h index d83b1282b7e..f7a0fa47e4a 100644 --- a/src/mongo/db/storage/devnull/devnull_kv_engine.h +++ b/src/mongo/db/storage/devnull/devnull_kv_engine.h @@ -147,6 +147,14 @@ public: return boost::none; } + virtual Timestamp getOldestTimestamp() const override { + return Timestamp(); + } + + virtual boost::optional<Timestamp> getRecoveryTimestamp() const { + return boost::none; + } + private: std::shared_ptr<void> _catalogInfo; diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h index 51c5ad46eb0..96a939c165a 100644 --- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h +++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h @@ -189,6 +189,10 @@ public: std::map<Timestamp, std::shared_ptr<StringStore>> getHistory_forTest(); + boost::optional<Timestamp> getRecoveryTimestamp() const override { + return boost::none; + } + static bool instanceExists(); private: diff --git a/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp b/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp index f8dc1608bbb..842cceb8da9 100644 --- a/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp +++ b/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp @@ -116,6 +116,14 @@ public: return boost::none; } + Timestamp getOldestTimestamp() const override { + return Timestamp(); + } + + boost::optional<Timestamp> getRecoveryTimestamp() const { + return boost::none; + } + // List of ident names removed using dropIdent(). std::vector<std::string> droppedIdents; diff --git a/src/mongo/db/storage/kv/kv_engine.h b/src/mongo/db/storage/kv/kv_engine.h index ceea8e968e8..2476aea54c5 100644 --- a/src/mongo/db/storage/kv/kv_engine.h +++ b/src/mongo/db/storage/kv/kv_engine.h @@ -452,9 +452,7 @@ public: MONGO_UNREACHABLE; } - virtual Timestamp getOldestTimestamp() const { - MONGO_UNREACHABLE; - } + virtual Timestamp getOldestTimestamp() const = 0; virtual Timestamp getStableTimestamp() const { MONGO_UNREACHABLE; diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp index 7ca561eeb1f..bb0df22fc68 100644 --- a/src/mongo/db/storage/storage_engine_impl.cpp +++ b/src/mongo/db/storage/storage_engine_impl.cpp @@ -36,6 +36,7 @@ #include "mongo/db/catalog/catalog_control.h" #include "mongo/db/catalog/collection_catalog.h" #include "mongo/db/catalog/collection_catalog_helper.h" +#include "mongo/db/catalog_raii.h" #include "mongo/db/client.h" #include "mongo/db/concurrency/d_concurrency.h" #include "mongo/db/index_builds_coordinator.h" @@ -156,6 +157,29 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc } std::vector<DurableCatalog::Entry> catalogEntries = _catalog->getAllCatalogEntries(opCtx); + + // Perform a read on the catalog at the `oldestTimestamp` and record the record stores (via + // their catalogId) that existed. + std::set<RecordId> existedAtOldestTs; + if (!_engine->getOldestTimestamp().isNull()) { + ReadSourceScope snapshotScope( + opCtx, RecoveryUnit::ReadSource::kProvided, _engine->getOldestTimestamp()); + auto entriesAtOldest = _catalog->getAllCatalogEntries(opCtx); + LOGV2_FOR_RECOVERY(5380110, + kCatalogLogLevel.toInt(), + "Catalog entries at the oldest timestamp", + "oldestTimestamp"_attr = _engine->getOldestTimestamp()); + for (auto entry : entriesAtOldest) { + existedAtOldestTs.insert(entry.catalogId); + LOGV2_FOR_RECOVERY(5380109, + kCatalogLogLevel.toInt(), + "Historical entry", + "catalogId"_attr = entry.catalogId, + "ident"_attr = entry.ident, + "namespace"_attr = entry.nss); + } + } + if (_options.forRepair) { // It's possible that there are collection files on disk that are unknown to the catalog. In // a repair context, if we can't find an ident in the catalog, we generate a catalog entry @@ -245,7 +269,23 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc } } - _initCollection(opCtx, entry.catalogId, entry.nss, _options.forRepair); + Timestamp minVisibleTs = Timestamp::min(); + // If there's no recovery timestamp, every collection is available. + if (boost::optional<Timestamp> recoveryTs = _engine->getRecoveryTimestamp()) { + // Otherwise choose a minimum visible timestamp that's at least as large as the true + // value. For every collection we will choose either the `oldestTimestamp` or the + // `recoveryTimestamp`. Choose the `oldestTimestamp` for collections that existed at the + // `oldestTimestamp` and conservatively choose the `recoveryTimestamp` for everything + // else. + minVisibleTs = recoveryTs.get(); + if (existedAtOldestTs.find(entry.catalogId) != existedAtOldestTs.end()) { + // Collections found at the `oldestTimestamp` on startup can have their minimum + // visible timestamp pulled back to that value. + minVisibleTs = _engine->getOldestTimestamp(); + } + } + + _initCollection(opCtx, entry.catalogId, entry.nss, _options.forRepair, minVisibleTs); auto maxPrefixForCollection = _catalog->getMetaData(opCtx, entry.catalogId).getMaxPrefix(); maxSeenPrefix = std::max(maxSeenPrefix, maxPrefixForCollection); @@ -264,7 +304,8 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc void StorageEngineImpl::_initCollection(OperationContext* opCtx, RecordId catalogId, const NamespaceString& nss, - bool forRepair) { + bool forRepair, + Timestamp minVisibleTs) { BSONCollectionCatalogEntry::MetaData md = _catalog->getMetaData(opCtx, catalogId); uassert(ErrorCodes::MustDowngrade, str::stream() << "Collection does not have UUID in KVCatalog. Collection: " << nss, @@ -286,6 +327,7 @@ void StorageEngineImpl::_initCollection(OperationContext* opCtx, auto collectionFactory = Collection::Factory::get(getGlobalServiceContext()); auto collection = collectionFactory->make(opCtx, nss, catalogId, uuid, std::move(rs)); + collection->setMinimumVisibleSnapshot(minVisibleTs); CollectionCatalog::write(opCtx, [&](CollectionCatalog& catalog) { catalog.registerCollection(opCtx, uuid, std::move(collection)); @@ -864,7 +906,10 @@ Status StorageEngineImpl::repairRecordStore(OperationContext* opCtx, auto uuid = catalog.lookupUUIDByNSS(opCtx, nss).get(); catalog.deregisterCollection(opCtx, uuid); }); - _initCollection(opCtx, catalogId, nss, false); + + // When repairing a record store, keep the existing behavior of not installing a minimum visible + // timestamp. + _initCollection(opCtx, catalogId, nss, false, Timestamp::min()); return status; } @@ -1022,9 +1067,9 @@ void StorageEngineImpl::_dumpCatalog(OperationContext* opCtx) { // not duplicate the log level policy. LOGV2_FOR_RECOVERY(4615634, kCatalogLogLevel.toInt(), - "Id: {rec_id} Value: {rec_data_toBson}", - "rec_id"_attr = rec->id, - "rec_data_toBson"_attr = rec->data.toBson()); + "Catalog entry", + "catalogId"_attr = rec->id, + "value"_attr = rec->data.toBson()); auto valueBson = rec->data.toBson(); if (valueBson.hasField("md")) { std::string ns = valueBson.getField("md").Obj().getField("ns").String(); diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h index c5486dd492a..9055797e22e 100644 --- a/src/mongo/db/storage/storage_engine_impl.h +++ b/src/mongo/db/storage/storage_engine_impl.h @@ -375,7 +375,8 @@ private: void _initCollection(OperationContext* opCtx, RecordId catalogId, const NamespaceString& nss, - bool forRepair); + bool forRepair, + Timestamp minVisibleTs); Status _dropCollectionsNoTimestamp(OperationContext* opCtx, std::vector<NamespaceString>& toDrop); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 2aa01e26d53..5bbffe3fa67 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -120,6 +120,8 @@ namespace { MONGO_FAIL_POINT_DEFINE(WTPreserveSnapshotHistoryIndefinitely); MONGO_FAIL_POINT_DEFINE(WTSetOldestTSToStableTS); +const std::string kPinOldestTimestampAtStartupName = "_wt_startup"; + } // namespace bool WiredTigerFileVersion::shouldDowngrade(bool readOnly, @@ -474,6 +476,35 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName, "recoveryTimestamp"_attr = _recoveryTimestamp); } + { + char buf[(2 * 8 /*bytes in hex*/) + 1 /*nul terminator*/]; + int ret = _conn->query_timestamp(_conn, buf, "get=oldest"); + if (ret != WT_NOTFOUND) { + invariantWTOK(ret); + + std::uint64_t tmp; + fassert(5380107, NumberParser().base(16)(buf, &tmp)); + LOGV2_FOR_RECOVERY( + 5380106, 0, "WiredTiger oldestTimestamp", "oldestTimestamp"_attr = Timestamp(tmp)); + // The oldest timestamp is set in WT. Only set the in-memory variable. + _oldestTimestamp.store(tmp); + setInitialDataTimestamp(Timestamp(tmp)); + } + } + + // If there's no recovery timestamp, MDB has not produced a consistent snapshot of + // data. `_oldestTimestamp` and `_initialDataTimestamp` are only meaningful when there's a + // consistent snapshot of data. + // + // Note, this code is defensive (i.e: protects against a theorized, unobserved case) and is + // primarily concerned with restarts of a process that was performing an eMRC=off rollback via + // refetch. + if (_recoveryTimestamp.isNull() && _oldestTimestamp.load() > 0) { + LOGV2_FOR_RECOVERY(5380108, 0, "There is an oldestTimestamp without a recoveryTimestamp"); + _oldestTimestamp.store(0); + _initialDataTimestamp.store(0); + } + _sessionCache.reset(new WiredTigerSessionCache(this)); _sessionSweeper = std::make_unique<WiredTigerSessionSweeper>(_sessionCache.get()); @@ -485,8 +516,25 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName, if (!_readOnly && !_ephemeral) { if (!_recoveryTimestamp.isNull()) { - setInitialDataTimestamp(_recoveryTimestamp); - setOldestTimestamp(_recoveryTimestamp, false); + // If the oldest/initial data timestamps were unset (there was no persisted durable + // history), initialize them to the recovery timestamp. + if (_oldestTimestamp.load() == 0) { + setInitialDataTimestamp(_recoveryTimestamp); + // Communicate the oldest timestamp to WT. + setOldestTimestamp(_recoveryTimestamp, false); + } + + // Pin the oldest timestamp prior to calling `setStableTimestamp` as that attempts to + // advance the oldest timestamp. We do this pinning to give features such as resharding + // an opportunity to re-pin the oldest timestamp after a restart. The assumptions this + // relies on are that: + // + // 1) The feature stores the desired pin timestamp in some local collection. + // 2) This temporary pinning lasts long enough for the catalog to be loaded and + // accessed. + uassertStatusOK(pinOldestTimestamp( + kPinOldestTimestampAtStartupName, Timestamp(_oldestTimestamp.load()), false)); + setStableTimestamp(_recoveryTimestamp, false); _sessionCache->snapshotManager().setLastApplied(_recoveryTimestamp); @@ -534,6 +582,7 @@ WiredTigerKVEngine::~WiredTigerKVEngine() { } void WiredTigerKVEngine::notifyStartupComplete() { + unpinOldestTimestamp(kPinOldestTimestampAtStartupName); WiredTigerUtil::notifyStartupComplete(); } @@ -666,7 +715,8 @@ void WiredTigerKVEngine::cleanShutdown() { 2, "Shutdown timestamps.", "Stable Timestamp"_attr = Timestamp(_stableTimestamp.load()), - "Initial Data Timestamp"_attr = Timestamp(_initialDataTimestamp.load())); + "Initial Data Timestamp"_attr = Timestamp(_initialDataTimestamp.load()), + "Oldest Timestamp"_attr = Timestamp(_oldestTimestamp.load())); _sizeStorer.reset(); _sessionCache->shuttingDown(); @@ -2070,6 +2120,11 @@ Timestamp WiredTigerKVEngine::_calculateHistoryLagFromStableTimestamp(Timestamp return Timestamp(); } + // The oldest timestamp cannot be set behind the `_initialDataTimestamp`. + if (calculatedOldestTimestamp.asULL() <= _initialDataTimestamp.load()) { + calculatedOldestTimestamp = Timestamp(_initialDataTimestamp.load()); + } + return calculatedOldestTimestamp; } @@ -2322,7 +2377,10 @@ void WiredTigerKVEngine::unpinOldestTimestamp(const std::string& requestingServi stdx::lock_guard<Latch> lock(_oldestTimestampPinRequestsMutex); auto it = _oldestTimestampPinRequests.find(requestingServiceName); if (it == _oldestTimestampPinRequests.end()) { - LOGV2_FATAL(5380105, "Missing pin request", "service"_attr = requestingServiceName); + LOGV2_WARNING(5380105, + "The requested service had nothing to unpin", + "service"_attr = requestingServiceName); + return; } LOGV2(5380103, "Unpin oldest timestamp request", |