summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/mongo/db/repl/replication_consistency_markers_impl.cpp5
-rw-r--r--src/mongo/db/repl/storage_interface.h11
-rw-r--r--src/mongo/db/repl/storage_interface_impl.cpp45
-rw-r--r--src/mongo/db/repl/storage_interface_impl.h3
-rw-r--r--src/mongo/db/repl/storage_interface_mock.h5
5 files changed, 58 insertions, 11 deletions
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.cpp b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
index 47cf89bec52..df7fd8b19e8 100644
--- a/src/mongo/db/repl/replication_consistency_markers_impl.cpp
+++ b/src/mongo/db/repl/replication_consistency_markers_impl.cpp
@@ -506,8 +506,9 @@ ReplicationConsistencyMarkersImpl::refreshOplogTruncateAfterPointIfPrimary(
// Fetch the oplog entry <= timestamp. all_durable may be set to a value between oplog entries.
// We need an oplog entry in order to return term and wallclock time for an OpTimeAndWallTime
// result.
- auto truncateOplogEntryBSON = _storageInterface->findOplogEntryLessThanOrEqualToTimestamp(
- opCtx, oplogRead.getCollection(), truncateTimestamp);
+ auto truncateOplogEntryBSON =
+ _storageInterface->findOplogEntryLessThanOrEqualToTimestampRetryOnWCE(
+ opCtx, oplogRead.getCollection(), truncateTimestamp);
// The truncate point moves the Durable timestamp forward, so it should always exist in the
// oplog.
diff --git a/src/mongo/db/repl/storage_interface.h b/src/mongo/db/repl/storage_interface.h
index 3f54e5abca1..0bb9bf0ea58 100644
--- a/src/mongo/db/repl/storage_interface.h
+++ b/src/mongo/db/repl/storage_interface.h
@@ -320,6 +320,17 @@ public:
OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) = 0;
/**
+ * Calls findOplogEntryLessThanOrEqualToTimestamp, retrying without limit on
+ * WriteConflictException. Any other error is thrown to the caller. WCEs can occur when the oplog
+ * read runs concurrently with the validate cmd on the same collection. Obeys opCtx interrupts.
+ *
+ * Call this function instead of findOplogEntryLessThanOrEqualToTimestamp when the caller cannot
+ * tolerate a failed read, e.g. because failing would compromise correctness.
+ */
+ virtual boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE(
+ OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) = 0;
+
+ /**
* Fetches the latest oplog entry's timestamp. Bypasses the oplog visibility rules.
*/
virtual Timestamp getLatestOplogTimestamp(OperationContext* opCtx) = 0;
diff --git a/src/mongo/db/repl/storage_interface_impl.cpp b/src/mongo/db/repl/storage_interface_impl.cpp
index e9b5f68d3d6..0b0e2e4e065 100644
--- a/src/mongo/db/repl/storage_interface_impl.cpp
+++ b/src/mongo/db/repl/storage_interface_impl.cpp
@@ -1048,19 +1048,11 @@ boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTi
invariant(oplog);
invariant(opCtx->lockState()->isLocked());
- // Using a YieldPolicy WRITE_CONFLICT_RETRY_ONLY that will allow query to retry on
- // WriteConflictExceptions without releasing locks that are important to callers.
- //
- // This read can run concurrently with the validate cmd's WT verify operation due to the special
- // locking rules for internal operations accessing the oplog collection. Validate holds a MODE_X
- // collection lock for WT verify, but an internal read only needs a MODE_IS global lock. Trying
- // to open a cursor on a collection that has a verify operation running produces an EBUSY error
- // that we then convert to a WCE.
std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec =
InternalPlanner::collectionScan(opCtx,
NamespaceString::kRsOplogNamespace.ns(),
oplog,
- PlanExecutor::WRITE_CONFLICT_RETRY_ONLY,
+ PlanExecutor::NO_YIELD,
InternalPlanner::BACKWARD);
// A record id in the oplog collection is equivalent to the document's timestamp field.
@@ -1082,6 +1074,41 @@ boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTi
return boost::none;
}
+// Retrying wrapper around findOplogEntryLessThanOrEqualToTimestamp.
+//
+// Forwards opCtx, oplogCollection and timestamp unchanged to the non-retrying lookup and returns
+// whatever that call returns. If the lookup throws a WriteConflictException, the exception is
+// swallowed, the thread sleeps, and the lookup is attempted again; there is no upper bound on the
+// number of attempts. Exceptions of any other type are not caught here and propagate to the caller.
+boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTimestampRetryOnWCE(
+ OperationContext* opCtx, Collection* oplogCollection, const Timestamp& timestamp) {
+ // Oplog reads are specially done under only MODE_IS global locks, without database or
+ // collection level intent locks. Therefore, reads can run concurrently with validate cmds that
+ // take collection MODE_X locks. Validate with {full:true} set calls WT::verify on the
+ // collection, which causes concurrent readers to hit WT EBUSY errors that MongoDB converts
+ // into WriteConflictException errors.
+ //
+ // Consequently, this code must be resilient to WCE errors and retry until the validate cmd
+ // finishes. The greater operation using this helper cannot simply fail because it would cause
+ // correctness errors.
+
+ // Number of WriteConflictExceptions caught so far; used only to rate-limit the log message.
+ int retries = 0;
+ while (true) {
+ try {
+ return findOplogEntryLessThanOrEqualToTimestamp(opCtx, oplogCollection, timestamp);
+ } catch (const WriteConflictException&) {
+ // Every WriteConflictException is handled as if it came from a concurrent validate cmd;
+ // nothing here checks what actually caused the conflict.
+ //
+ // This will log a message about the conflict initially and then every 5 seconds, with
+ // the current rather arbitrary settings (every 10th retry, 500ms sleep per retry).
+ if (retries % 10 == 0) {
+ LOGV2(47959000,
+ "Reading the oplog collection conflicts with a validate cmd. Continuing to "
+ "retry.",
+ "retries"_attr = retries);
+ }
+
+ ++retries;
+
+ // Sleep a bit so we do not keep hammering the system with retries while the validate
+ // cmd finishes.
+ // NOTE(review): the interface comment says this function obeys opCtx interrupts;
+ // presumably sleepFor is what ends the loop on interruption -- confirm.
+ opCtx->sleepFor(Milliseconds(500));
+ }
+ }
+}
+
Timestamp StorageInterfaceImpl::getLatestOplogTimestamp(OperationContext* opCtx) {
auto statusWithTimestamp = [&]() {
AutoGetOplog oplogRead(opCtx, OplogAccessMode::kRead);
diff --git a/src/mongo/db/repl/storage_interface_impl.h b/src/mongo/db/repl/storage_interface_impl.h
index ba7698f30a6..6d6152369f3 100644
--- a/src/mongo/db/repl/storage_interface_impl.h
+++ b/src/mongo/db/repl/storage_interface_impl.h
@@ -145,6 +145,9 @@ public:
boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestamp(
OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override;
+ // Overrides the base-class virtual of the same name: the variant of
+ // findOplogEntryLessThanOrEqualToTimestamp that retries on WriteConflictException.
+ boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE(
+ OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override;
+
Timestamp getLatestOplogTimestamp(OperationContext* opCtx) override;
StatusWith<StorageInterface::CollectionSize> getCollectionSize(
diff --git a/src/mongo/db/repl/storage_interface_mock.h b/src/mongo/db/repl/storage_interface_mock.h
index 9f610ded093..eed01a324af 100644
--- a/src/mongo/db/repl/storage_interface_mock.h
+++ b/src/mongo/db/repl/storage_interface_mock.h
@@ -267,6 +267,11 @@ public:
return boost::none;
}
+ // Mock override: performs no oplog lookup and no retries; ignores all arguments and always
+ // returns boost::none.
+ boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE(
+ OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override {
+ return boost::none;
+ }
+
Timestamp getLatestOplogTimestamp(OperationContext* opCtx) override {
return Timestamp();
}