summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJosef Ahmad <josef.ahmad@mongodb.com>2022-01-13 11:21:43 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-01-13 12:12:05 +0000
commit5a1dbe698bf88ef3f0ce2348c705d6a042c32011 (patch)
tree770e9b2da7fdb55d4b204d69b7b72334e8ccccc8
parent546d77f0945fabc0fc485e762bd7548871ffc601 (diff)
downloadmongo-5a1dbe698bf88ef3f0ce2348c705d6a042c32011.tar.gz
SERVER-61709 Introduce implicitly replicated namespaces
Implicitly replicated namespaces are internal namespaces that do not replicate writes, with the exception of deletions, user-initiated direct writes and some maintenance operations. This patch lists config.system.preimages, config.images_collection, config.transactions and config.changes.* as implicitly replicated namespaces, and unifies and validates their semantics. It also special-cases some of the config.transactions replication behaviour that is too specific to be generalised.
-rw-r--r--jstests/core/write_change_stream_pit_preimage.js5
-rw-r--r--jstests/noPassthrough/change_streams_pre_image_removal_job.js10
-rw-r--r--jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js4
-rw-r--r--jstests/sharding/internal_sessions_reaping.js8
-rw-r--r--src/mongo/db/commands/set_feature_compatibility_version_command.cpp3
-rw-r--r--src/mongo/db/exec/update_stage.cpp12
-rw-r--r--src/mongo/db/exec/upsert_stage.cpp9
-rw-r--r--src/mongo/db/namespace_string.cpp16
-rw-r--r--src/mongo/db/namespace_string.h17
-rw-r--r--src/mongo/db/op_observer_impl.cpp1
-rw-r--r--src/mongo/db/op_observer_impl_test.cpp1
-rw-r--r--src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp1
-rw-r--r--src/mongo/db/repl/dbcheck.cpp3
-rw-r--r--src/mongo/db/repl/oplog.cpp1
-rw-r--r--src/mongo/db/session_catalog.h2
15 files changed, 86 insertions, 7 deletions
diff --git a/jstests/core/write_change_stream_pit_preimage.js b/jstests/core/write_change_stream_pit_preimage.js
index baccdbd3109..4c463899515 100644
--- a/jstests/core/write_change_stream_pit_preimage.js
+++ b/jstests/core/write_change_stream_pit_preimage.js
@@ -48,6 +48,11 @@ function assertPreImagesWrittenForOps(db, ops, expectedPreImages) {
assert.eq(writtenPreImages[idx].preImage, expectedPreImages[idx]);
assertValidChangeStreamPreImageDocument(writtenPreImages[idx]);
}
+
+ // Because the pre-images collection is implicitly replicated, validate that writes do not
+ // generate oplog entries, with the exception of deletions.
+ assert.eq(0,
+ localDB.oplog.rs.find({op: {'$ne': 'd'}, ns: 'config.system.preimages'}).itcount());
}
// Validates that no pre-image is written while performing ops.
diff --git a/jstests/noPassthrough/change_streams_pre_image_removal_job.js b/jstests/noPassthrough/change_streams_pre_image_removal_job.js
index cb9ba2e8c17..6d7ca7253ee 100644
--- a/jstests/noPassthrough/change_streams_pre_image_removal_job.js
+++ b/jstests/noPassthrough/change_streams_pre_image_removal_job.js
@@ -41,6 +41,7 @@ rst.startSet({setParameter: {expiredChangeStreamPreImageRemovalJobSleepSecs: 1}}
rst.initiate();
const primaryNode = rst.getPrimary();
const testDB = primaryNode.getDB(jsTestName());
+const localDB = primaryNode.getDB("local");
const collA =
assertCreateCollection(testDB, "collA", {changeStreamPreAndPostImages: {enabled: true}});
const collB =
@@ -59,7 +60,8 @@ for (const coll of [collA, collB]) {
// Pre-images collection should contain four pre-images.
preImages = getPreImages(primaryNode);
-assert.eq(preImages.length, 4, preImages);
+const preImagesToExpire = 4;
+assert.eq(preImages.length, preImagesToExpire, preImages);
// Roll over all current oplog entries.
const lastOplogEntryToBeRemoved = getLatestOp(primaryNode);
@@ -96,6 +98,12 @@ assert.soon(() => {
return onlyTwoPreImagesLeft && allPreImagesHaveBiggerTimestamp;
});
+// Because the pre-images collection is implicitly replicated, validate that writes do not generate
+// oplog entries, with the exception of deletions.
+const preimagesNs = 'config.system.preimages';
+assert.eq(preImagesToExpire, localDB.oplog.rs.find({op: 'd', ns: preimagesNs}).itcount());
+assert.eq(0, localDB.oplog.rs.find({op: {'$ne': 'd'}, ns: preimagesNs}).itcount());
+
// Verify that pre-images collection content on the primary node is the same as on the
// secondary.
rst.awaitReplication();
diff --git a/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js b/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js
index 67bb79af9ae..a54b2cf0fb0 100644
--- a/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js
+++ b/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js
@@ -336,6 +336,10 @@ function runTests(lsid,
retryRes = assert.commandWorked(mainConn.getDB('test').runCommand(cmd));
assertRetryCommand(res, retryRes);
+ // Because the config.image_collection table is implicitly replicated, validate that writes do
+ // not generate oplog entries, with the exception of deletions.
+ assert.eq(0, oplog.find({ns: "config.image_collection", op: {'$ne': 'd'}}).itcount());
+
assert(mainConn.getDB('test').user.drop());
}
diff --git a/jstests/sharding/internal_sessions_reaping.js b/jstests/sharding/internal_sessions_reaping.js
index 4da85811378..26267ffbd78 100644
--- a/jstests/sharding/internal_sessions_reaping.js
+++ b/jstests/sharding/internal_sessions_reaping.js
@@ -33,10 +33,12 @@ const kCollName = "testColl";
const kConfigSessionsNs = "config.system.sessions";
const kConfigTxnsNs = "config.transactions";
const kImageCollNs = "config.image_collection";
+const kOplogCollNs = "local.oplog.rs";
let sessionsCollOnPrimary = shard0Primary.getCollection(kConfigSessionsNs);
let transactionsCollOnPrimary = shard0Primary.getCollection(kConfigTxnsNs);
let imageCollOnPrimary = shard0Primary.getCollection(kImageCollNs);
+let oplogCollOnPrimary = shard0Primary.getCollection(kOplogCollNs);
let testDB = shard0Primary.getDB(kDbName);
assert.commandWorked(testDB.createCollection(kCollName));
@@ -181,5 +183,11 @@ assert.eq(0,
tojson(transactionsCollOnPrimary.find().toArray()));
assert.eq(0, imageCollOnPrimary.find().itcount());
+// Because the config.transactions is implicitly replicated, validate that writes do not generate
+// oplog entries, with the exception of deletions.
+assert.eq(numTransactionsCollEntries,
+ oplogCollOnPrimary.find({op: 'd', ns: kConfigTxnsNs}).itcount());
+assert.eq(0, oplogCollOnPrimary.find({op: {'$ne': 'd'}, ns: kConfigTxnsNs}).itcount());
+
st.stop();
})();
diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp
index d7d6d9f6d8e..a1aa4f78199 100644
--- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp
+++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp
@@ -711,7 +711,8 @@ private:
// Due to the possibility that the shell or drivers have implicit sessions enabled, we
// cannot write to the config.transactions collection while we're in a session. So we
// construct a temporary client to as a work around.
- auto newClient = opCtx->getServiceContext()->makeClient("InternalSessionsCleanup");
+ auto newClient = opCtx->getServiceContext()->makeClient(
+ SessionCatalog::kInternalSessionsCleanupClient.toString());
{
stdx::lock_guard<Client> lk(*newClient.get());
diff --git a/src/mongo/db/exec/update_stage.cpp b/src/mongo/db/exec/update_stage.cpp
index bda0a81078b..d6e53a345ef 100644
--- a/src/mongo/db/exec/update_stage.cpp
+++ b/src/mongo/db/exec/update_stage.cpp
@@ -52,6 +52,7 @@
#include "mongo/db/s/sharding_state.h"
#include "mongo/db/s/sharding_write_router.h"
#include "mongo/db/service_context.h"
+#include "mongo/db/session_catalog.h"
#include "mongo/db/storage/duplicate_key_error_info.h"
#include "mongo/db/update/path_support.h"
#include "mongo/db/update/storage_validation.h"
@@ -372,6 +373,17 @@ PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) {
return PlanStage::IS_EOF;
}
+ boost::optional<repl::UnreplicatedWritesBlock> unReplBlock;
+ const auto isSessionCleanupClient =
+ opCtx()->getClient()->desc() == SessionCatalog::kInternalSessionsCleanupClient;
+ if (collection()->ns().isImplicitlyReplicated() && !_isUserInitiatedWrite &&
+ !isSessionCleanupClient) {
+ // Implictly replicated collections do not replicate updates.
+ // However, user-initiated writes and some background maintenance tasks are allowed
+ // to replicate as they cannot be derived from the oplog.
+ unReplBlock.emplace(opCtx());
+ }
+
// It is possible that after an update was applied, a WriteConflictException
// occurred and prevented us from returning ADVANCED with the requested version
// of the document.
diff --git a/src/mongo/db/exec/upsert_stage.cpp b/src/mongo/db/exec/upsert_stage.cpp
index a6f6bf73d04..f1b684a544d 100644
--- a/src/mongo/db/exec/upsert_stage.cpp
+++ b/src/mongo/db/exec/upsert_stage.cpp
@@ -35,6 +35,7 @@
#include "mongo/db/curop_failpoint_helpers.h"
#include "mongo/db/query/query_feature_flags_gen.h"
#include "mongo/db/s/operation_sharding_state.h"
+#include "mongo/db/session_catalog.h"
#include "mongo/db/update/storage_validation.h"
#include "mongo/s/would_change_owning_shard_exception.h"
@@ -72,6 +73,14 @@ PlanStage::StageState UpsertStage::doWork(WorkingSetID* out) {
return StageState::IS_EOF;
}
+ boost::optional<repl::UnreplicatedWritesBlock> unReplBlock;
+ const auto isSessionCleanupClient =
+ opCtx()->getClient()->desc() == SessionCatalog::kInternalSessionsCleanupClient;
+ if (collection()->ns().isImplicitlyReplicated() && !isSessionCleanupClient) {
+ // Implictly replicated collections do not replicate updates.
+ unReplBlock.emplace(opCtx());
+ }
+
// First, attempt to perform the update on a matching document.
auto updateState = UpdateStage::doWork(out);
diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp
index 631177c4d63..f1e258ba2ce 100644
--- a/src/mongo/db/namespace_string.cpp
+++ b/src/mongo/db/namespace_string.cpp
@@ -347,6 +347,10 @@ bool NamespaceString::isConfigImagesCollection() const {
return ns() == kConfigImagesNamespace.ns();
}
+bool NamespaceString::isConfigTransactionsCollection() const {
+ return ns() == kSessionTransactionsTableNamespace.ns();
+}
+
NamespaceString NamespaceString::makeTimeseriesBucketsNamespace() const {
return {db(), kTimeseriesBucketsCollectionPrefix.toString() + coll()};
}
@@ -356,6 +360,18 @@ NamespaceString NamespaceString::getTimeseriesViewNamespace() const {
return {db(), coll().substr(kTimeseriesBucketsCollectionPrefix.size())};
}
+bool NamespaceString::isImplicitlyReplicated() const {
+ if (isChangeStreamPreImagesCollection() || isConfigImagesCollection() ||
+ isConfigTransactionsCollection() || isChangeCollection()) {
+ // Implicitly replicated namespaces are replicated, although they only replicate a subset of
+ // writes.
+ invariant(isReplicated());
+ return true;
+ }
+
+ return false;
+}
+
bool NamespaceString::isReplicated() const {
if (isLocal()) {
return false;
diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h
index 17ea8855bc7..339c62ce23f 100644
--- a/src/mongo/db/namespace_string.h
+++ b/src/mongo/db/namespace_string.h
@@ -307,6 +307,9 @@ public:
bool isSystemDotProfile() const {
return coll() == "system.profile";
}
+ bool isChangeCollection() const {
+ return (db() == kConfigDb) && coll().startsWith("changes.");
+ }
bool isSystemDotViews() const {
return coll() == kSystemDotViewsCollectionName;
}
@@ -384,6 +387,11 @@ public:
bool isConfigImagesCollection() const;
/**
+ * Returns whether the specified namespace is config.transactions.
+ */
+ bool isConfigTransactionsCollection() const;
+
+ /**
* Returns the time-series buckets namespace for this view.
*/
NamespaceString makeTimeseriesBucketsNamespace() const;
@@ -394,6 +402,15 @@ public:
NamespaceString getTimeseriesViewNamespace() const;
/**
+ * Returns whether the namespace is implicitly replicated, based only on its string value.
+ *
+ * An implicitly replicated namespace is an internal namespace which does not replicate writes
+ * via the oplog, with the exception of deletions. Writes are not replicated as an optimization
+ * because their content can be reliably derived from entries in the oplog.
+ */
+ bool isImplicitlyReplicated() const;
+
+ /**
* Returns whether a namespace is replicated, based only on its string value. One notable
* omission is that map reduce `tmp.mr` collections may or may not be replicated. Callers must
* decide how to handle that case separately.
diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp
index 398c577df06..51bf478673f 100644
--- a/src/mongo/db/op_observer_impl.cpp
+++ b/src/mongo/db/op_observer_impl.cpp
@@ -297,7 +297,6 @@ void writeToImageCollection(OperationContext* opCtx,
imageEntry.setImageKind(imageKind);
imageEntry.setImage(dataImage);
- repl::UnreplicatedWritesBlock unreplicated(opCtx);
DisableDocumentValidation documentValidationDisabler(
opCtx, DocumentValidationSettings::kDisableInternalValidation);
diff --git a/src/mongo/db/op_observer_impl_test.cpp b/src/mongo/db/op_observer_impl_test.cpp
index 373d979feab..d0273e162ea 100644
--- a/src/mongo/db/op_observer_impl_test.cpp
+++ b/src/mongo/db/op_observer_impl_test.cpp
@@ -187,6 +187,7 @@ public:
reset(opCtx, NamespaceString::kRsOplogNamespace);
reset(opCtx, NamespaceString::kSessionTransactionsTableNamespace);
reset(opCtx, NamespaceString::kConfigImagesNamespace);
+ reset(opCtx, NamespaceString::kChangeStreamPreImagesNamespace);
}
protected:
diff --git a/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp b/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp
index 01335139a8d..68e2e6949f8 100644
--- a/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp
+++ b/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp
@@ -48,7 +48,6 @@ void writeToChangeStreamPreImagesCollection(OperationContext* opCtx,
// This lock acquisition can block on a stronger lock held by another operation modifying the
// pre-images collection. There are no known cases where an operation holding an exclusive lock
// on the pre-images collection also waits for oplog visibility.
- repl::UnreplicatedWritesBlock unreplicated(opCtx);
AllowLockAcquisitionOnTimestampedUnitOfWork allowLockAcquisition(opCtx->lockState());
AutoGetCollection preimagesCollectionRaii(opCtx, collectionNamespace, LockMode::MODE_IX);
UpdateResult res = Helpers::upsert(opCtx, collectionNamespace.toString(), preImage.toBSON());
diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp
index 79a98205972..cb909bbd08e 100644
--- a/src/mongo/db/repl/dbcheck.cpp
+++ b/src/mongo/db/repl/dbcheck.cpp
@@ -206,8 +206,7 @@ std::unique_ptr<HealthLogEntry> dbCheckBatchEntry(
}
// Implcitily replicated collections and capped collections not replicating truncation are
// not designed to be consistent, so inconsistency is not necessarily pathological.
- if (nss.isChangeStreamPreImagesCollection() || nss.isConfigImagesCollection() ||
- (options && options->capped)) {
+ if (nss.isImplicitlyReplicated() || (options && options->capped)) {
return SeverityEnum::Warning;
}
diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp
index ea9b83dc41b..ec5933b830d 100644
--- a/src/mongo/db/repl/oplog.cpp
+++ b/src/mongo/db/repl/oplog.cpp
@@ -297,7 +297,6 @@ void writeToImageCollection(OperationContext* opCtx,
request.setFromOplogApplication(true);
try {
// This code path can also be hit by things such as `applyOps` and tenant migrations.
- repl::UnreplicatedWritesBlock dontReplicate(opCtx);
::mongo::update(opCtx, autoColl.getDb(), request);
} catch (const ExceptionFor<ErrorCodes::DuplicateKey>&) {
// We can get a duplicate key when two upserts race on inserting a document.
diff --git a/src/mongo/db/session_catalog.h b/src/mongo/db/session_catalog.h
index 7bed6ae91a3..bcf696042b7 100644
--- a/src/mongo/db/session_catalog.h
+++ b/src/mongo/db/session_catalog.h
@@ -59,6 +59,8 @@ class SessionCatalog {
friend class OperationContextSession;
public:
+ static constexpr StringData kInternalSessionsCleanupClient = "InternalSessionsCleanup"_sd;
+
class ScopedCheckedOutSession;
class SessionToKill;