summary | refs | log | tree | commit | diff
path: root/src/mongo/db/read_concern_mongod.cpp
diff options
context:
space:
mode:
author: William Schultz <william.schultz@mongodb.com> 2019-03-15 14:07:58 -0400
committer: William Schultz <william.schultz@mongodb.com> 2019-03-15 14:07:58 -0400
commit: c83b8d8aab53f7545851a76425b2f2cd7c598cbd (patch)
tree: 3a43d6425a1c578c866f6c24b62c0d52a794da0b /src/mongo/db/read_concern_mongod.cpp
parent: c346f2db39f1ddae16ecf8643041e19a31a83d84 (diff)
download: mongo-c83b8d8aab53f7545851a76425b2f2cd7c598cbd.tar.gz
SERVER-39356 Make speculative majority change stream reads utilize the `kNoOverlap` timestamp read source
Speculative majority change streams provide "majority" read guarantees by reading from a local snapshot of data and then waiting for that data to become majority committed, instead of reading directly from a majority-committed snapshot. In order to satisfy this guarantee, a speculative majority read must wait for the proper timestamp to become majority committed after reading data. If the newest data it read reflects a timestamp T, then it must wait for a timestamp >= T to become majority committed. In general, waiting on replication's lastApplied timestamp is not safe, since it is possible for writes to be visible to readers even if those writes have not yet advanced the in-memory value of lastApplied. To work around this issue for speculative majority reads, we instead choose to read at an explicitly chosen timestamp in the storage engine, and then wait for that timestamp to become majority committed. This gives us a more direct way to know what timestamp the data we read reflects. We utilize the `kNoOverlap` read source, which allows us to read at min(lastApplied, all_committed); this is a convenient way to make these reads work correctly on both primaries and secondaries.
Diffstat (limited to 'src/mongo/db/read_concern_mongod.cpp')
-rw-r--r-- src/mongo/db/read_concern_mongod.cpp 21
1 files changed, 13 insertions, 8 deletions
diff --git a/src/mongo/db/read_concern_mongod.cpp b/src/mongo/db/read_concern_mongod.cpp
index f3cb33c24dd..c961956ec48 100644
--- a/src/mongo/db/read_concern_mongod.cpp
+++ b/src/mongo/db/read_concern_mongod.cpp
@@ -38,6 +38,7 @@
#include "mongo/db/op_observer.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/read_concern_mongod_gen.h"
+#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/repl_client_info.h"
#include "mongo/db/repl/speculative_majority_read_info.h"
#include "mongo/db/s/sharding_state.h"
@@ -300,9 +301,11 @@ MONGO_REGISTER_SHIM(waitForReadConcern)
// Handle speculative majority reads.
if (readConcernArgs.getMajorityReadMechanism() ==
repl::ReadConcernArgs::MajorityReadMechanism::kSpeculative) {
- // We read from a local snapshot, so there is no need to set an explicit read source.
- // Mark down that we need to block after the command is done to satisfy majority read
- // concern, though.
+ // For speculative majority reads, we utilize the "no overlap" read source as a means of
+ // always reading at the minimum of the all-committed and lastApplied timestamps. This
+ // allows for safe behavior on both primaries and secondaries, where the behavior of the
+ // all-committed and lastApplied timestamps differ significantly.
+ opCtx->recoveryUnit()->setTimestampReadSource(RecoveryUnit::ReadSource::kNoOverlap);
auto& speculativeReadInfo = repl::SpeculativeMajorityReadInfo::get(opCtx);
speculativeReadInfo.setIsSpeculativeRead();
return Status::OK();
@@ -390,17 +393,19 @@ MONGO_REGISTER_SHIM(waitForSpeculativeMajorityReadConcern)
invariant(speculativeReadInfo.isSpeculativeRead());
// Select the timestamp to wait on. A command may have selected a specific timestamp to wait on.
- // If not, then we just wait on the most recent timestamp written on this node i.e. lastApplied.
+ // If not, then we use the timestamp selected by the read source.
auto replCoord = repl::ReplicationCoordinator::get(opCtx);
Timestamp waitTs;
- auto lastAppliedTs = replCoord->getMyLastAppliedOpTime().getTimestamp();
auto speculativeReadTimestamp = speculativeReadInfo.getSpeculativeReadTimestamp();
if (speculativeReadTimestamp) {
- // The timestamp provided must not be greater than the current lastApplied.
- invariant(*speculativeReadTimestamp <= lastAppliedTs);
waitTs = *speculativeReadTimestamp;
} else {
- waitTs = lastAppliedTs;
+ // Speculative majority reads are required to use the 'kNoOverlap' read source.
+ invariant(opCtx->recoveryUnit()->getTimestampReadSource() ==
+ RecoveryUnit::ReadSource::kNoOverlap);
+ boost::optional<Timestamp> readTs = opCtx->recoveryUnit()->getPointInTimeReadTimestamp();
+ invariant(readTs);
+ waitTs = *readTs;
}
// Block to make sure returned data is majority committed.