summaryrefslogtreecommitdiff
path: root/src/mongo/db/storage
diff options
context:
space:
mode:
authorLouis Williams <louis.williams@mongodb.com>2020-09-15 12:43:48 -0400
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-09-15 17:10:52 +0000
commita68bbe9d37f0c576295016919c9e0595c4ec1427 (patch)
tree2b0fac4bc73bb554ae145cd07409c931c7c0dbc6 /src/mongo/db/storage
parent2b904bb7d987b1538dce3d1800d97ae73cd791b4 (diff)
downloadmongo-a68bbe9d37f0c576295016919c9e0595c4ec1427.tar.gz
SERVER-48452 Internal readers should default to reading without a timestamp
Removes ReadSource::kUnset in favor of kNoTimestamp as the default Makes the following behavioral changes to AutoGetCollectionForRead: * Removes special early-return handling for kNoTimestamp * Only user or DBDirectClient operations are eligible to read at kLastApplied. * Operations only read at kLastApplied when in the SECONDARY state, nothing else. This means most internal operations that use DBDirectClient do not need to use a ReadSourceScope to ensure they read at kNoTimestamp. (cherry picked from commit 11c68393df88a6f1ea4855e6ac15e54ca9f9d976)
Diffstat (limited to 'src/mongo/db/storage')
-rw-r--r--src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp4
-rw-r--r--src/mongo/db/storage/recovery_unit.h10
-rw-r--r--src/mongo/db/storage/snapshot_helper.cpp74
-rw-r--r--src/mongo/db/storage/snapshot_helper.h4
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp4
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h2
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp5
7 files changed, 62 insertions, 41 deletions
diff --git a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
index baecec3c8af..26ade4d6cfe 100644
--- a/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
+++ b/src/mongo/db/storage/kv/kv_engine_timestamps_test.cpp
@@ -372,7 +372,7 @@ TEST_F(SnapshotManagerTests, InsertAndReadOnLastAppliedSnapshot) {
// Not reading on the last applied timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
- ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_EQ(itCountOn(op), 1);
ASSERT(readRecordOn(op, id));
@@ -408,7 +408,7 @@ TEST_F(SnapshotManagerTests, UpdateAndDeleteOnLocalSnapshot) {
// Not reading on the last local timestamp returns the most recent data.
auto op = makeOperation();
auto ru = op->recoveryUnit();
- ru->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ ru->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_EQ(itCountOn(op), 1);
auto record = readRecordOn(op, id);
ASSERT_EQ(std::string(record->data.data()), "Blue spotted stingray");
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index e98452c20f2..96d5ec25f66 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -382,11 +382,7 @@ public:
*/
enum ReadSource {
/**
- * Do not read from a timestamp. This is the default.
- */
- kUnset,
- /**
- * Read without a timestamp explicitly.
+ * Read without a timestamp. This is the default.
*/
kNoTimestamp,
/**
@@ -414,8 +410,6 @@ public:
static std::string toString(ReadSource rs) {
switch (rs) {
- case ReadSource::kUnset:
- return "kUnset";
case ReadSource::kNoTimestamp:
return "kNoTimestamp";
case ReadSource::kMajorityCommitted:
@@ -445,7 +439,7 @@ public:
boost::optional<Timestamp> provided = boost::none) {}
virtual ReadSource getTimestampReadSource() const {
- return ReadSource::kUnset;
+ return ReadSource::kNoTimestamp;
};
/**
diff --git a/src/mongo/db/storage/snapshot_helper.cpp b/src/mongo/db/storage/snapshot_helper.cpp
index 57a1c3b99e6..913248d7ad7 100644
--- a/src/mongo/db/storage/snapshot_helper.cpp
+++ b/src/mongo/db/storage/snapshot_helper.cpp
@@ -38,29 +38,37 @@
#include "mongo/logv2/log.h"
namespace mongo {
-namespace SnapshotHelper {
-bool canSwitchReadSource(OperationContext* opCtx) {
-
- // Most readConcerns have behavior controlled at higher levels. Local and available are the only
- // ReadConcerns that should consider changing, since they read without a timestamp by default.
+namespace {
+bool canReadAtLastApplied(OperationContext* opCtx) {
+ // Local and available are the only ReadConcern levels that allow their ReadSource to be
+ // overridden to read at lastApplied. They read without a timestamp by default, but this check
+ // allows user secondary reads from conflicting with oplog batch application by reading at a
+ // consistent point in time.
+ // Internal operations use DBDirectClient as a loopback to perform local operations, and they
+ // expect the same level of consistency guarantees as any user operation. For that reason,
+ // DBDirectClient should be able to change the owning operation's ReadSource in order to serve
+ // consistent data.
const auto readConcernLevel = repl::ReadConcernArgs::get(opCtx).getLevel();
- if (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
- readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern) {
+ if ((opCtx->getClient()->isFromUserConnection() || opCtx->getClient()->isInDirectClient()) &&
+ (readConcernLevel == repl::ReadConcernLevel::kLocalReadConcern ||
+ readConcernLevel == repl::ReadConcernLevel::kAvailableReadConcern)) {
return true;
}
-
return false;
}
+} // namespace
+namespace SnapshotHelper {
bool shouldReadAtLastApplied(OperationContext* opCtx,
const NamespaceString& nss,
std::string* reason) {
-
// If this is true, then the operation opted-in to the PBWM lock, implying that it cannot change
// its ReadSource. It's important to note that it is possible for this to be false, but still be
// holding the PBWM lock, explained below.
if (opCtx->lockState()->shouldConflictWithSecondaryBatchApplication()) {
- *reason = "conflicts with batch application";
+ if (reason) {
+ *reason = "conflicts with batch application";
+ }
return false;
}
@@ -71,16 +79,32 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
// guaranteed to observe all previous writes. This may occur when multiple collection locks are
// held concurrently, which is often the case when DBDirectClient is used.
if (opCtx->lockState()->isLockHeldForMode(resourceIdParallelBatchWriterMode, MODE_IS)) {
- *reason = "PBWM lock is held";
+ if (reason) {
+ *reason = "PBWM lock is held";
+ }
LOGV2_DEBUG(20577, 1, "not reading at lastApplied because the PBWM lock is held");
return false;
}
- // If we are in a replication state (like secondary or primary catch-up) where we are not
- // accepting writes, we should read at lastApplied. If this node can accept writes, then no
- // conflicting replication batches are being applied and we can read from the default snapshot.
+ // If this node can accept writes (i.e. primary), then no conflicting replication batches are
+ // being applied and we can read from the default snapshot. If we are in a replication state
+ // (like secondary or primary catch-up) where we are not accepting writes, we should read at
+ // lastApplied.
if (repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesForDatabase(opCtx, "admin")) {
- *reason = "primary";
+ if (reason) {
+ *reason = "primary";
+ }
+ return false;
+ }
+
+ // If we are not secondary, then we should not attempt to read at lastApplied because it may not
+ // be available or valid. Any operations reading outside of the primary or secondary states must
+ // be internal. We give these operations the benefit of the doubt rather than attempting to read
+ // at a lastApplied timestamp that is not valid.
+ if (!repl::ReplicationCoordinator::get(opCtx)->isInPrimaryOrSecondaryState(opCtx)) {
+ if (reason) {
+ *reason = "not primary or secondary";
+ }
return false;
}
@@ -88,7 +112,9 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
// written by the replication system. However, the oplog is special, as it *is* written by the
// replication system.
if (!nss.isReplicated() && !nss.isOplog()) {
- *reason = "unreplicated collection";
+ if (reason) {
+ *reason = "unreplicated collection";
+ }
return false;
}
@@ -96,15 +122,14 @@ bool shouldReadAtLastApplied(OperationContext* opCtx,
}
boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
const NamespaceString& nss) {
- const bool canSwitch = canSwitchReadSource(opCtx);
- if (!canSwitch) {
+ if (!canReadAtLastApplied(opCtx)) {
return boost::none;
}
const auto existing = opCtx->recoveryUnit()->getTimestampReadSource();
std::string reason;
const bool readAtLastApplied = shouldReadAtLastApplied(opCtx, nss, &reason);
- if (existing == RecoveryUnit::ReadSource::kUnset) {
+ if (existing == RecoveryUnit::ReadSource::kNoTimestamp) {
// Shifting from reading without a timestamp to reading with a timestamp can be dangerous
// because writes will appear to vanish. This case is intended for new reads on secondaries
// and query yield recovery after state transitions from primary to secondary.
@@ -124,13 +149,14 @@ boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opC
if (!readAtLastApplied) {
LOGV2_DEBUG(4452902,
2,
- "Changing ReadSource to kUnset",
+ "Changing ReadSource to kNoTimestamp",
"collection"_attr = nss,
"reason"_attr = reason);
- // their ReadSources after performing reads at an un-timetamped snapshot. The only
- // exception is callers of this function that may need to change from kUnset to
- // kLastApplied in the event of a catalog conflict or query yield.
- return RecoveryUnit::ReadSource::kUnset;
+ // This shift to kNoTimestamp assumes that callers will not make future attempts to
+ // manipulate their ReadSources after performing reads at an un-timetamped snapshot. The
+ // only exception is callers of this function that may need to change from kNoTimestamp
+ // to kLastApplied in the event of a catalog conflict or query yield.
+ return RecoveryUnit::ReadSource::kNoTimestamp;
}
}
return boost::none;
diff --git a/src/mongo/db/storage/snapshot_helper.h b/src/mongo/db/storage/snapshot_helper.h
index fa8fdd85f24..c24dfd16d8c 100644
--- a/src/mongo/db/storage/snapshot_helper.h
+++ b/src/mongo/db/storage/snapshot_helper.h
@@ -37,6 +37,10 @@ namespace SnapshotHelper {
boost::optional<RecoveryUnit::ReadSource> getNewReadSource(OperationContext* opCtx,
const NamespaceString& nss);
+bool shouldReadAtLastApplied(OperationContext* opCtx,
+ const NamespaceString& nss,
+ std::string* reason = nullptr);
+
bool collectionChangesConflictWithRead(boost::optional<Timestamp> collectionMin,
boost::optional<Timestamp> readTimestamp);
} // namespace SnapshotHelper
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
index 7ddc76c2cf9..02d2986b736 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp
@@ -449,7 +449,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
// transaction to establish a read timestamp, but only for ReadSources that are expected to have
// read timestamps.
switch (_timestampReadSource) {
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp:
return boost::none;
case ReadSource::kMajorityCommitted:
@@ -488,7 +487,6 @@ boost::optional<Timestamp> WiredTigerRecoveryUnit::getPointInTimeReadTimestamp()
return _readAtTimestamp;
// The follow ReadSources returned values in the first switch block.
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp:
case ReadSource::kMajorityCommitted:
case ReadSource::kProvided:
@@ -511,7 +509,6 @@ void WiredTigerRecoveryUnit::_txnOpen() {
WT_SESSION* session = _session->getSession();
switch (_timestampReadSource) {
- case ReadSource::kUnset:
case ReadSource::kNoTimestamp: {
if (_isOplogReader) {
_oplogVisibleTs = static_cast<std::int64_t>(_oplogManager->getOplogReadTimestamp());
@@ -828,7 +825,6 @@ void WiredTigerRecoveryUnit::setTimestampReadSource(ReadSource readSource,
"setting timestamp read source",
"readSource"_attr = toString(readSource),
"provided"_attr = ((provided) ? provided->toString() : "none"));
-
invariant(!_isActive() || _timestampReadSource == readSource,
str::stream() << "Current state: " << toString(_getState())
<< ". Invalid internal state while setting timestamp read source: "
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index c61d80205ac..b3961eacee0 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -244,7 +244,7 @@ private:
bool _isTimestamped = false;
// Specifies which external source to use when setting read timestamps on transactions.
- ReadSource _timestampReadSource = ReadSource::kUnset;
+ ReadSource _timestampReadSource = ReadSource::kNoTimestamp;
// Commits are assumed ordered. Unordered commits are assumed to always need to reserve a
// new optime, and thus always call oplogDiskLocRegister() on the record store.
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
index 1f440d319b7..7171f75fbeb 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp
@@ -203,7 +203,8 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
}
// Read without a timestamp. The write should be visible.
- ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(), RecoveryUnit::ReadSource::kUnset);
+ ASSERT_EQ(opCtx1->recoveryUnit()->getTimestampReadSource(),
+ RecoveryUnit::ReadSource::kNoTimestamp);
RecordData unused;
ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
@@ -237,7 +238,7 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, NoOverlapReadSource) {
// Read without a timestamp, and we should see the first and third records.
opCtx1->recoveryUnit()->abandonSnapshot();
- opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kUnset);
+ opCtx1->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoTimestamp);
ASSERT_TRUE(rs->findRecord(opCtx1, rid1, &unused));
ASSERT_FALSE(rs->findRecord(opCtx1, rid2, &unused));
ASSERT_TRUE(rs->findRecord(opCtx1, rid3, &unused));