diff options
-rw-r--r-- | src/mongo/db/repl/replication_consistency_markers_impl.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_interface.h | 11 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_interface_impl.cpp | 45 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_interface_impl.h | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_interface_mock.h | 5 |
5 files changed, 58 insertions, 11 deletions
diff --git a/src/mongo/db/repl/replication_consistency_markers_impl.cpp b/src/mongo/db/repl/replication_consistency_markers_impl.cpp index 47cf89bec52..df7fd8b19e8 100644 --- a/src/mongo/db/repl/replication_consistency_markers_impl.cpp +++ b/src/mongo/db/repl/replication_consistency_markers_impl.cpp @@ -506,8 +506,9 @@ ReplicationConsistencyMarkersImpl::refreshOplogTruncateAfterPointIfPrimary( // Fetch the oplog entry <= timestamp. all_durable may be set to a value between oplog entries. // We need an oplog entry in order to return term and wallclock time for an OpTimeAndWallTime // result. - auto truncateOplogEntryBSON = _storageInterface->findOplogEntryLessThanOrEqualToTimestamp( - opCtx, oplogRead.getCollection(), truncateTimestamp); + auto truncateOplogEntryBSON = + _storageInterface->findOplogEntryLessThanOrEqualToTimestampRetryOnWCE( + opCtx, oplogRead.getCollection(), truncateTimestamp); // The truncate point moves the Durable timestamp forward, so it should always exist in the // oplog. diff --git a/src/mongo/db/repl/storage_interface.h b/src/mongo/db/repl/storage_interface.h index 3f54e5abca1..0bb9bf0ea58 100644 --- a/src/mongo/db/repl/storage_interface.h +++ b/src/mongo/db/repl/storage_interface.h @@ -320,6 +320,17 @@ public: OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) = 0; /** + * Calls findOplogEntryLessThanOrEqualToTimestamp with endless WriteConflictException retries. + * Other errors get thrown. Concurrent oplog reads with the validate cmd on the same collection + * may throw WCEs. Obeys opCtx interrupts. + * + * Call this function instead of findOplogEntryLessThanOrEqualToTimestamp if the caller cannot + * fail, say for correctness. + */ + virtual boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE( + OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) = 0; + + /** * Fetches the latest oplog entry's timestamp. Bypasses the oplog visibility rules. */ virtual Timestamp getLatestOplogTimestamp(OperationContext* opCtx) = 0; diff --git a/src/mongo/db/repl/storage_interface_impl.cpp b/src/mongo/db/repl/storage_interface_impl.cpp index e9b5f68d3d6..0b0e2e4e065 100644 --- a/src/mongo/db/repl/storage_interface_impl.cpp +++ b/src/mongo/db/repl/storage_interface_impl.cpp @@ -1048,19 +1048,11 @@ boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTi invariant(oplog); invariant(opCtx->lockState()->isLocked()); - // Using a YieldPolicy WRITE_CONFLICT_RETRY_ONLY that will allow query to retry on - // WriteConflictExceptions without releasing locks that are important to callers. - // - // This read can run concurrently with the validate cmd's WT verify operation due to the special - // locking rules for internal operations accessing the oplog collection. Validate holds a MODE_X - // collection lock for WT verify, but an internal read only needs a MODE_IS global lock. Trying - // to open a cursor on a collection that has a verify operation running produces an EBUSY error - // that we then convert to a WCE. std::unique_ptr<PlanExecutor, PlanExecutor::Deleter> exec = InternalPlanner::collectionScan(opCtx, NamespaceString::kRsOplogNamespace.ns(), oplog, - PlanExecutor::WRITE_CONFLICT_RETRY_ONLY, + PlanExecutor::NO_YIELD, InternalPlanner::BACKWARD); // A record id in the oplog collection is equivalent to the document's timestamp field. @@ -1082,6 +1074,41 @@ boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTi return boost::none; } +boost::optional<BSONObj> StorageInterfaceImpl::findOplogEntryLessThanOrEqualToTimestampRetryOnWCE( + OperationContext* opCtx, Collection* oplogCollection, const Timestamp& timestamp) { + // Oplog reads are specially done under only MODE_IS global locks, without database or + // collection level intent locks. Therefore, reads can run concurrently with validate cmds that + // take collection MODE_X locks. Validate with {full:true} set calls WT::verify on the + // collection, which causes concurrent readers to hit WT EBUSY errors that MongoDB converts + // into WriteConflictException errors. + // + // Consequently, this code must be resilient to WCE errors and retry until the validate cmd + // finishes. The greater operation using this helper cannot simply fail because it would cause + // correctness errors. + + int retries = 0; + while (true) { + try { + return findOplogEntryLessThanOrEqualToTimestamp(opCtx, oplogCollection, timestamp); + } catch (const WriteConflictException&) { + // This will log a message about the conflict initially and then every 5 seconds, with + // the current rather arbitrary settings. + if (retries % 10 == 0) { + LOGV2(47959000, + "Reading the oplog collection conflicts with a validate cmd. Continuing to " + "retry.", + "retries"_attr = retries); + } + + ++retries; + + // Sleep a bit so we do not keep hammering the system with retries while the validate + // cmd finishes. + opCtx->sleepFor(Milliseconds(500)); + } + } +} + Timestamp StorageInterfaceImpl::getLatestOplogTimestamp(OperationContext* opCtx) { auto statusWithTimestamp = [&]() { AutoGetOplog oplogRead(opCtx, OplogAccessMode::kRead); diff --git a/src/mongo/db/repl/storage_interface_impl.h b/src/mongo/db/repl/storage_interface_impl.h index ba7698f30a6..6d6152369f3 100644 --- a/src/mongo/db/repl/storage_interface_impl.h +++ b/src/mongo/db/repl/storage_interface_impl.h @@ -145,6 +145,9 @@ public: boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestamp( OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override; + boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE( + OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override; + Timestamp getLatestOplogTimestamp(OperationContext* opCtx) override; StatusWith<StorageInterface::CollectionSize> getCollectionSize( diff --git a/src/mongo/db/repl/storage_interface_mock.h b/src/mongo/db/repl/storage_interface_mock.h index 9f610ded093..eed01a324af 100644 --- a/src/mongo/db/repl/storage_interface_mock.h +++ b/src/mongo/db/repl/storage_interface_mock.h @@ -267,6 +267,11 @@ public: return boost::none; } + boost::optional<BSONObj> findOplogEntryLessThanOrEqualToTimestampRetryOnWCE( + OperationContext* opCtx, Collection* oplog, const Timestamp& timestamp) override { + return boost::none; + } + Timestamp getLatestOplogTimestamp(OperationContext* opCtx) override { return Timestamp(); } |