summaryrefslogtreecommitdiff
path: root/src/mongo/db/write_concern.cpp
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@mongodb.com>2020-02-27 08:46:40 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-02-27 14:38:40 +0000
commit454b3e9f8418bb9b0134869af582a0518aad5e12 (patch)
treea86801e6f799ca0a8a07a69d798e6522fb6c3666 /src/mongo/db/write_concern.cpp
parent4e20d92959e36a27ebedac34dfd313c4b1295364 (diff)
downloadmongo-454b3e9f8418bb9b0134869af582a0518aad5e12.tar.gz
SERVER-46186 For {j: true} writeConcern writes, single-voter replica set primaries now wait for earlier writes to become visible before flushing the journal to disk, ensuring that user-confirmed writes are never truncated during recovery after a crash.
Diffstat (limited to 'src/mongo/db/write_concern.cpp')
-rw-r--r--src/mongo/db/write_concern.cpp29
1 files changed, 29 insertions, 0 deletions
diff --git a/src/mongo/db/write_concern.cpp b/src/mongo/db/write_concern.cpp
index 85262273fc6..d72aa98df12 100644
--- a/src/mongo/db/write_concern.cpp
+++ b/src/mongo/db/write_concern.cpp
@@ -41,6 +41,7 @@
#include "mongo/db/read_write_concern_defaults.h"
#include "mongo/db/repl/optime.h"
#include "mongo/db/repl/replication_coordinator.h"
+#include "mongo/db/repl/storage_interface.h"
#include "mongo/db/server_options.h"
#include "mongo/db/service_context.h"
#include "mongo/db/stats/timer_stats.h"
@@ -244,6 +245,32 @@ void WriteConcernResult::appendTo(BSONObjBuilder* result) const {
result->append("err", err);
}
+/**
+ * For {j: true} write concern on a single-voter replica set primary, blocks until there are no
+ * oplog holes behind the caller's write. The caller must flush to disk afterwards (this function
+ * does not); together these guarantee the write survives unclean shutdown and restart recovery.
+ *
+ * Multi-voter replica sets will likely roll back writes if the primary crashes and restarts, but
+ * single-voter sets never roll back writes, and that behavior must be preserved: multi-node
+ * single-voter primaries must truncate the oplog to ensure cross-replica set data consistency,
+ * and single-node single-voter sets must never lose confirmed writes.
+ *
+ * The oplogTruncateAfterPoint is updated with the no-holes point prior to journal flushing (write
+ * persistence). Once the no-holes point is at or past our write, the flush to disk saves a
+ * truncate point that will not truncate the new write we wish to guarantee.
+ *
+ * Does nothing unless the config has exactly one voting member. Can throw on opCtx interruption.
+ */
+void waitForNoOplogHolesIfNeeded(OperationContext* opCtx) {
+ auto const replCoord = repl::ReplicationCoordinator::get(opCtx);
+ if (replCoord->getConfig().votingMembers().size() == 1) {
+ // Pass primaryOnly=true: secondaries in multi-node single-voter replica sets can safely
+ // truncate writes when there are oplog holes, since they can catch up again.
+ repl::StorageInterface::get(opCtx)->waitForAllEarlierOplogWritesToBeVisible(
+ opCtx, /*primaryOnly*/ true);
+ }
+}
+
Status waitForWriteConcern(OperationContext* opCtx,
const OpTime& replOpTime,
const WriteConcernOptions& writeConcern,
@@ -277,6 +304,7 @@ Status waitForWriteConcern(OperationContext* opCtx,
case WriteConcernOptions::SyncMode::NONE:
break;
case WriteConcernOptions::SyncMode::FSYNC: {
+ waitForNoOplogHolesIfNeeded(opCtx);
if (!storageEngine->isDurable()) {
storageEngine->flushAllFiles(opCtx, /*callerHoldsReadLock*/ false);
@@ -290,6 +318,7 @@ Status waitForWriteConcern(OperationContext* opCtx,
break;
}
case WriteConcernOptions::SyncMode::JOURNAL:
+ waitForNoOplogHolesIfNeeded(opCtx);
storageEngine->waitForJournalFlush(opCtx);
break;
}