summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2021-01-08 13:30:33 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-01-08 19:56:20 +0000
commitca040c1f469aa0ffd68e7a0605c10145e7fb65dc (patch)
tree292dbc3aa4f45741cc2b5de76741d5766e02c536 /src/mongo
parentdae67dbc6db48733d8a4a6d50f07ef469735807c (diff)
downloadmongo-ca040c1f469aa0ffd68e7a0605c10145e7fb65dc.tar.gz
SERVER-46678: Utilize durable history across restarts.
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/catalog/index_catalog_impl.cpp13
-rw-r--r--src/mongo/db/storage/SConscript1
-rw-r--r--src/mongo/db/storage/devnull/devnull_kv_engine.h8
-rw-r--r--src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h4
-rw-r--r--src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp8
-rw-r--r--src/mongo/db/storage/kv/kv_engine.h4
-rw-r--r--src/mongo/db/storage/storage_engine_impl.cpp57
-rw-r--r--src/mongo/db/storage/storage_engine_impl.h3
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp66
9 files changed, 150 insertions, 14 deletions
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp
index a84eb7f995d..3f7eb16770d 100644
--- a/src/mongo/db/catalog/index_catalog_impl.cpp
+++ b/src/mongo/db/catalog/index_catalog_impl.cpp
@@ -115,6 +115,12 @@ Status IndexCatalogImpl::init(OperationContext* opCtx) {
const bool replSetMemberInStandaloneMode =
getReplSetMemberInStandaloneMode(opCtx->getServiceContext());
+ boost::optional<Timestamp> recoveryTs = boost::none;
+ if (auto storageEngine = opCtx->getServiceContext()->getStorageEngine();
+ storageEngine->supportsRecoveryTimestamp()) {
+ recoveryTs = storageEngine->getRecoveryTimestamp();
+ }
+
for (size_t i = 0; i < indexNames.size(); i++) {
const string& indexName = indexNames[i];
BSONObj spec =
@@ -162,6 +168,13 @@ Status IndexCatalogImpl::init(OperationContext* opCtx) {
auto flags = CreateIndexEntryFlags::kInitFromDisk | CreateIndexEntryFlags::kIsReady;
IndexCatalogEntry* entry = createIndexEntry(opCtx, std::move(descriptor), flags);
fassert(17340, entry->isReady(opCtx));
+
+ // When initializing indexes from disk, we conservatively set the minimumVisibleSnapshot
+ // to non _id indexes to the recovery timestamp. The _id index is left visible. It's
+ // assumed if the collection is visible, it's _id is valid to be used.
+ if (recoveryTs && !entry->descriptor()->isIdIndex()) {
+ entry->setMinimumVisibleSnapshot(recoveryTs.get());
+ }
}
}
diff --git a/src/mongo/db/storage/SConscript b/src/mongo/db/storage/SConscript
index e138a247664..68a312f54b9 100644
--- a/src/mongo/db/storage/SConscript
+++ b/src/mongo/db/storage/SConscript
@@ -566,6 +566,7 @@ env.Library(
LIBDEPS=[
'$BUILD_DIR/mongo/base',
'$BUILD_DIR/mongo/db/catalog/catalog_control',
+ '$BUILD_DIR/mongo/db/catalog_raii',
'$BUILD_DIR/mongo/db/server_options_core',
'$BUILD_DIR/mongo/db/storage/durable_catalog_impl',
'$BUILD_DIR/mongo/db/storage/kv/kv_drop_pending_ident_reaper',
diff --git a/src/mongo/db/storage/devnull/devnull_kv_engine.h b/src/mongo/db/storage/devnull/devnull_kv_engine.h
index d83b1282b7e..f7a0fa47e4a 100644
--- a/src/mongo/db/storage/devnull/devnull_kv_engine.h
+++ b/src/mongo/db/storage/devnull/devnull_kv_engine.h
@@ -147,6 +147,14 @@ public:
return boost::none;
}
+ virtual Timestamp getOldestTimestamp() const override {
+ return Timestamp();
+ }
+
+ virtual boost::optional<Timestamp> getRecoveryTimestamp() const {
+ return boost::none;
+ }
+
private:
std::shared_ptr<void> _catalogInfo;
diff --git a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h
index 51c5ad46eb0..96a939c165a 100644
--- a/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h
+++ b/src/mongo/db/storage/ephemeral_for_test/ephemeral_for_test_kv_engine.h
@@ -189,6 +189,10 @@ public:
std::map<Timestamp, std::shared_ptr<StringStore>> getHistory_forTest();
+ boost::optional<Timestamp> getRecoveryTimestamp() const override {
+ return boost::none;
+ }
+
static bool instanceExists();
private:
diff --git a/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp b/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp
index f8dc1608bbb..842cceb8da9 100644
--- a/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp
+++ b/src/mongo/db/storage/kv/kv_drop_pending_ident_reaper_test.cpp
@@ -116,6 +116,14 @@ public:
return boost::none;
}
+ Timestamp getOldestTimestamp() const override {
+ return Timestamp();
+ }
+
+ boost::optional<Timestamp> getRecoveryTimestamp() const {
+ return boost::none;
+ }
+
// List of ident names removed using dropIdent().
std::vector<std::string> droppedIdents;
diff --git a/src/mongo/db/storage/kv/kv_engine.h b/src/mongo/db/storage/kv/kv_engine.h
index ceea8e968e8..2476aea54c5 100644
--- a/src/mongo/db/storage/kv/kv_engine.h
+++ b/src/mongo/db/storage/kv/kv_engine.h
@@ -452,9 +452,7 @@ public:
MONGO_UNREACHABLE;
}
- virtual Timestamp getOldestTimestamp() const {
- MONGO_UNREACHABLE;
- }
+ virtual Timestamp getOldestTimestamp() const = 0;
virtual Timestamp getStableTimestamp() const {
MONGO_UNREACHABLE;
diff --git a/src/mongo/db/storage/storage_engine_impl.cpp b/src/mongo/db/storage/storage_engine_impl.cpp
index 7ca561eeb1f..bb0df22fc68 100644
--- a/src/mongo/db/storage/storage_engine_impl.cpp
+++ b/src/mongo/db/storage/storage_engine_impl.cpp
@@ -36,6 +36,7 @@
#include "mongo/db/catalog/catalog_control.h"
#include "mongo/db/catalog/collection_catalog.h"
#include "mongo/db/catalog/collection_catalog_helper.h"
+#include "mongo/db/catalog_raii.h"
#include "mongo/db/client.h"
#include "mongo/db/concurrency/d_concurrency.h"
#include "mongo/db/index_builds_coordinator.h"
@@ -156,6 +157,29 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc
}
std::vector<DurableCatalog::Entry> catalogEntries = _catalog->getAllCatalogEntries(opCtx);
+
+ // Perform a read on the catalog at the `oldestTimestamp` and record the record stores (via
+ // their catalogId) that existed.
+ std::set<RecordId> existedAtOldestTs;
+ if (!_engine->getOldestTimestamp().isNull()) {
+ ReadSourceScope snapshotScope(
+ opCtx, RecoveryUnit::ReadSource::kProvided, _engine->getOldestTimestamp());
+ auto entriesAtOldest = _catalog->getAllCatalogEntries(opCtx);
+ LOGV2_FOR_RECOVERY(5380110,
+ kCatalogLogLevel.toInt(),
+ "Catalog entries at the oldest timestamp",
+ "oldestTimestamp"_attr = _engine->getOldestTimestamp());
+ for (auto entry : entriesAtOldest) {
+ existedAtOldestTs.insert(entry.catalogId);
+ LOGV2_FOR_RECOVERY(5380109,
+ kCatalogLogLevel.toInt(),
+ "Historical entry",
+ "catalogId"_attr = entry.catalogId,
+ "ident"_attr = entry.ident,
+ "namespace"_attr = entry.nss);
+ }
+ }
+
if (_options.forRepair) {
// It's possible that there are collection files on disk that are unknown to the catalog. In
// a repair context, if we can't find an ident in the catalog, we generate a catalog entry
@@ -245,7 +269,23 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc
}
}
- _initCollection(opCtx, entry.catalogId, entry.nss, _options.forRepair);
+ Timestamp minVisibleTs = Timestamp::min();
+ // If there's no recovery timestamp, every collection is available.
+ if (boost::optional<Timestamp> recoveryTs = _engine->getRecoveryTimestamp()) {
+ // Otherwise choose a minimum visible timestamp that's at least as large as the true
+ // value. For every collection we will choose either the `oldestTimestamp` or the
+ // `recoveryTimestamp`. Choose the `oldestTimestamp` for collections that existed at the
+ // `oldestTimestamp` and conservatively choose the `recoveryTimestamp` for everything
+ // else.
+ minVisibleTs = recoveryTs.get();
+ if (existedAtOldestTs.find(entry.catalogId) != existedAtOldestTs.end()) {
+ // Collections found at the `oldestTimestamp` on startup can have their minimum
+ // visible timestamp pulled back to that value.
+ minVisibleTs = _engine->getOldestTimestamp();
+ }
+ }
+
+ _initCollection(opCtx, entry.catalogId, entry.nss, _options.forRepair, minVisibleTs);
auto maxPrefixForCollection = _catalog->getMetaData(opCtx, entry.catalogId).getMaxPrefix();
maxSeenPrefix = std::max(maxSeenPrefix, maxPrefixForCollection);
@@ -264,7 +304,8 @@ void StorageEngineImpl::loadCatalog(OperationContext* opCtx, bool loadingFromUnc
void StorageEngineImpl::_initCollection(OperationContext* opCtx,
RecordId catalogId,
const NamespaceString& nss,
- bool forRepair) {
+ bool forRepair,
+ Timestamp minVisibleTs) {
BSONCollectionCatalogEntry::MetaData md = _catalog->getMetaData(opCtx, catalogId);
uassert(ErrorCodes::MustDowngrade,
str::stream() << "Collection does not have UUID in KVCatalog. Collection: " << nss,
@@ -286,6 +327,7 @@ void StorageEngineImpl::_initCollection(OperationContext* opCtx,
auto collectionFactory = Collection::Factory::get(getGlobalServiceContext());
auto collection = collectionFactory->make(opCtx, nss, catalogId, uuid, std::move(rs));
+ collection->setMinimumVisibleSnapshot(minVisibleTs);
CollectionCatalog::write(opCtx, [&](CollectionCatalog& catalog) {
catalog.registerCollection(opCtx, uuid, std::move(collection));
@@ -864,7 +906,10 @@ Status StorageEngineImpl::repairRecordStore(OperationContext* opCtx,
auto uuid = catalog.lookupUUIDByNSS(opCtx, nss).get();
catalog.deregisterCollection(opCtx, uuid);
});
- _initCollection(opCtx, catalogId, nss, false);
+
+ // When repairing a record store, keep the existing behavior of not installing a minimum visible
+ // timestamp.
+ _initCollection(opCtx, catalogId, nss, false, Timestamp::min());
return status;
}
@@ -1022,9 +1067,9 @@ void StorageEngineImpl::_dumpCatalog(OperationContext* opCtx) {
// not duplicate the log level policy.
LOGV2_FOR_RECOVERY(4615634,
kCatalogLogLevel.toInt(),
- "Id: {rec_id} Value: {rec_data_toBson}",
- "rec_id"_attr = rec->id,
- "rec_data_toBson"_attr = rec->data.toBson());
+ "Catalog entry",
+ "catalogId"_attr = rec->id,
+ "value"_attr = rec->data.toBson());
auto valueBson = rec->data.toBson();
if (valueBson.hasField("md")) {
std::string ns = valueBson.getField("md").Obj().getField("ns").String();
diff --git a/src/mongo/db/storage/storage_engine_impl.h b/src/mongo/db/storage/storage_engine_impl.h
index c5486dd492a..9055797e22e 100644
--- a/src/mongo/db/storage/storage_engine_impl.h
+++ b/src/mongo/db/storage/storage_engine_impl.h
@@ -375,7 +375,8 @@ private:
void _initCollection(OperationContext* opCtx,
RecordId catalogId,
const NamespaceString& nss,
- bool forRepair);
+ bool forRepair,
+ Timestamp minVisibleTs);
Status _dropCollectionsNoTimestamp(OperationContext* opCtx,
std::vector<NamespaceString>& toDrop);
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
index 2aa01e26d53..5bbffe3fa67 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp
@@ -120,6 +120,8 @@ namespace {
MONGO_FAIL_POINT_DEFINE(WTPreserveSnapshotHistoryIndefinitely);
MONGO_FAIL_POINT_DEFINE(WTSetOldestTSToStableTS);
+const std::string kPinOldestTimestampAtStartupName = "_wt_startup";
+
} // namespace
bool WiredTigerFileVersion::shouldDowngrade(bool readOnly,
@@ -474,6 +476,35 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
"recoveryTimestamp"_attr = _recoveryTimestamp);
}
+ {
+ char buf[(2 * 8 /*bytes in hex*/) + 1 /*nul terminator*/];
+ int ret = _conn->query_timestamp(_conn, buf, "get=oldest");
+ if (ret != WT_NOTFOUND) {
+ invariantWTOK(ret);
+
+ std::uint64_t tmp;
+ fassert(5380107, NumberParser().base(16)(buf, &tmp));
+ LOGV2_FOR_RECOVERY(
+ 5380106, 0, "WiredTiger oldestTimestamp", "oldestTimestamp"_attr = Timestamp(tmp));
+ // The oldest timestamp is set in WT. Only set the in-memory variable.
+ _oldestTimestamp.store(tmp);
+ setInitialDataTimestamp(Timestamp(tmp));
+ }
+ }
+
+ // If there's no recovery timestamp, MDB has not produced a consistent snapshot of
+ // data. `_oldestTimestamp` and `_initialDataTimestamp` are only meaningful when there's a
+ // consistent snapshot of data.
+ //
+ // Note, this code is defensive (i.e: protects against a theorized, unobserved case) and is
+ // primarily concerned with restarts of a process that was performing an eMRC=off rollback via
+ // refetch.
+ if (_recoveryTimestamp.isNull() && _oldestTimestamp.load() > 0) {
+ LOGV2_FOR_RECOVERY(5380108, 0, "There is an oldestTimestamp without a recoveryTimestamp");
+ _oldestTimestamp.store(0);
+ _initialDataTimestamp.store(0);
+ }
+
_sessionCache.reset(new WiredTigerSessionCache(this));
_sessionSweeper = std::make_unique<WiredTigerSessionSweeper>(_sessionCache.get());
@@ -485,8 +516,25 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
if (!_readOnly && !_ephemeral) {
if (!_recoveryTimestamp.isNull()) {
- setInitialDataTimestamp(_recoveryTimestamp);
- setOldestTimestamp(_recoveryTimestamp, false);
+ // If the oldest/initial data timestamps were unset (there was no persisted durable
+ // history), initialize them to the recovery timestamp.
+ if (_oldestTimestamp.load() == 0) {
+ setInitialDataTimestamp(_recoveryTimestamp);
+ // Communicate the oldest timestamp to WT.
+ setOldestTimestamp(_recoveryTimestamp, false);
+ }
+
+ // Pin the oldest timestamp prior to calling `setStableTimestamp` as that attempts to
+ // advance the oldest timestamp. We do this pinning to give features such as resharding
+ // an opportunity to re-pin the oldest timestamp after a restart. The assumptions this
+ // relies on are that:
+ //
+ // 1) The feature stores the desired pin timestamp in some local collection.
+ // 2) This temporary pinning lasts long enough for the catalog to be loaded and
+ // accessed.
+ uassertStatusOK(pinOldestTimestamp(
+ kPinOldestTimestampAtStartupName, Timestamp(_oldestTimestamp.load()), false));
+
setStableTimestamp(_recoveryTimestamp, false);
_sessionCache->snapshotManager().setLastApplied(_recoveryTimestamp);
@@ -534,6 +582,7 @@ WiredTigerKVEngine::~WiredTigerKVEngine() {
}
void WiredTigerKVEngine::notifyStartupComplete() {
+ unpinOldestTimestamp(kPinOldestTimestampAtStartupName);
WiredTigerUtil::notifyStartupComplete();
}
@@ -666,7 +715,8 @@ void WiredTigerKVEngine::cleanShutdown() {
2,
"Shutdown timestamps.",
"Stable Timestamp"_attr = Timestamp(_stableTimestamp.load()),
- "Initial Data Timestamp"_attr = Timestamp(_initialDataTimestamp.load()));
+ "Initial Data Timestamp"_attr = Timestamp(_initialDataTimestamp.load()),
+ "Oldest Timestamp"_attr = Timestamp(_oldestTimestamp.load()));
_sizeStorer.reset();
_sessionCache->shuttingDown();
@@ -2070,6 +2120,11 @@ Timestamp WiredTigerKVEngine::_calculateHistoryLagFromStableTimestamp(Timestamp
return Timestamp();
}
+ // The oldest timestamp cannot be set behind the `_initialDataTimestamp`.
+ if (calculatedOldestTimestamp.asULL() <= _initialDataTimestamp.load()) {
+ calculatedOldestTimestamp = Timestamp(_initialDataTimestamp.load());
+ }
+
return calculatedOldestTimestamp;
}
@@ -2322,7 +2377,10 @@ void WiredTigerKVEngine::unpinOldestTimestamp(const std::string& requestingServi
stdx::lock_guard<Latch> lock(_oldestTimestampPinRequestsMutex);
auto it = _oldestTimestampPinRequests.find(requestingServiceName);
if (it == _oldestTimestampPinRequests.end()) {
- LOGV2_FATAL(5380105, "Missing pin request", "service"_attr = requestingServiceName);
+ LOGV2_WARNING(5380105,
+ "The requested service had nothing to unpin",
+ "service"_attr = requestingServiceName);
+ return;
}
LOGV2(5380103,
"Unpin oldest timestamp request",