diff options
author | Josef Ahmad <josef.ahmad@mongodb.com> | 2022-01-13 11:21:43 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-01-13 12:12:05 +0000 |
commit | 5a1dbe698bf88ef3f0ce2348c705d6a042c32011 (patch) | |
tree | 770e9b2da7fdb55d4b204d69b7b72334e8ccccc8 | |
parent | 546d77f0945fabc0fc485e762bd7548871ffc601 (diff) | |
download | mongo-5a1dbe698bf88ef3f0ce2348c705d6a042c32011.tar.gz |
SERVER-61709 Introduce implicitly replicated namespaces
Implicitly replicated namespaces are internal namespaces that do not replicate
writes, with the exception of deletions, user-initiated direct writes and some
maintenance operations.
This patch lists config.system.preimages, config.images_collection,
config.transactions and config.changes.* as implicitly replicated namespaces,
and unifies and validates their semantics. It also special-cases some of the
config.transactions replication behaviour that is too specific to be generalised.
-rw-r--r-- | jstests/core/write_change_stream_pit_preimage.js | 5 | ||||
-rw-r--r-- | jstests/noPassthrough/change_streams_pre_image_removal_job.js | 10 | ||||
-rw-r--r-- | jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js | 4 | ||||
-rw-r--r-- | jstests/sharding/internal_sessions_reaping.js | 8 | ||||
-rw-r--r-- | src/mongo/db/commands/set_feature_compatibility_version_command.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/exec/update_stage.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/exec/upsert_stage.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/namespace_string.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/namespace_string.h | 17 | ||||
-rw-r--r-- | src/mongo/db/op_observer_impl.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/op_observer_impl_test.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/repl/dbcheck.cpp | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/oplog.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/session_catalog.h | 2 |
15 files changed, 86 insertions, 7 deletions
diff --git a/jstests/core/write_change_stream_pit_preimage.js b/jstests/core/write_change_stream_pit_preimage.js index baccdbd3109..4c463899515 100644 --- a/jstests/core/write_change_stream_pit_preimage.js +++ b/jstests/core/write_change_stream_pit_preimage.js @@ -48,6 +48,11 @@ function assertPreImagesWrittenForOps(db, ops, expectedPreImages) { assert.eq(writtenPreImages[idx].preImage, expectedPreImages[idx]); assertValidChangeStreamPreImageDocument(writtenPreImages[idx]); } + + // Because the pre-images collection is implicitly replicated, validate that writes do not + // generate oplog entries, with the exception of deletions. + assert.eq(0, + localDB.oplog.rs.find({op: {'$ne': 'd'}, ns: 'config.system.preimages'}).itcount()); } // Validates that no pre-image is written while performing ops. diff --git a/jstests/noPassthrough/change_streams_pre_image_removal_job.js b/jstests/noPassthrough/change_streams_pre_image_removal_job.js index cb9ba2e8c17..6d7ca7253ee 100644 --- a/jstests/noPassthrough/change_streams_pre_image_removal_job.js +++ b/jstests/noPassthrough/change_streams_pre_image_removal_job.js @@ -41,6 +41,7 @@ rst.startSet({setParameter: {expiredChangeStreamPreImageRemovalJobSleepSecs: 1}} rst.initiate(); const primaryNode = rst.getPrimary(); const testDB = primaryNode.getDB(jsTestName()); +const localDB = primaryNode.getDB("local"); const collA = assertCreateCollection(testDB, "collA", {changeStreamPreAndPostImages: {enabled: true}}); const collB = @@ -59,7 +60,8 @@ for (const coll of [collA, collB]) { // Pre-images collection should contain four pre-images. preImages = getPreImages(primaryNode); -assert.eq(preImages.length, 4, preImages); +const preImagesToExpire = 4; +assert.eq(preImages.length, preImagesToExpire, preImages); // Roll over all current oplog entries. const lastOplogEntryToBeRemoved = getLatestOp(primaryNode); @@ -96,6 +98,12 @@ assert.soon(() => { return onlyTwoPreImagesLeft && allPreImagesHaveBiggerTimestamp; }); +// Because the pre-images collection is implicitly replicated, validate that writes do not generate +// oplog entries, with the exception of deletions. +const preimagesNs = 'config.system.preimages'; +assert.eq(preImagesToExpire, localDB.oplog.rs.find({op: 'd', ns: preimagesNs}).itcount()); +assert.eq(0, localDB.oplog.rs.find({op: {'$ne': 'd'}, ns: preimagesNs}).itcount()); + // Verify that pre-images collection content on the primary node is the same as on the // secondary. rst.awaitReplication(); diff --git a/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js b/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js index 67bb79af9ae..a54b2cf0fb0 100644 --- a/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js +++ b/jstests/noPassthrough/store_retryable_find_and_modify_images_in_side_collection.js @@ -336,6 +336,10 @@ function runTests(lsid, retryRes = assert.commandWorked(mainConn.getDB('test').runCommand(cmd)); assertRetryCommand(res, retryRes); + // Because the config.image_collection table is implicitly replicated, validate that writes do + // not generate oplog entries, with the exception of deletions. + assert.eq(0, oplog.find({ns: "config.image_collection", op: {'$ne': 'd'}}).itcount()); + assert(mainConn.getDB('test').user.drop()); } diff --git a/jstests/sharding/internal_sessions_reaping.js b/jstests/sharding/internal_sessions_reaping.js index 4da85811378..26267ffbd78 100644 --- a/jstests/sharding/internal_sessions_reaping.js +++ b/jstests/sharding/internal_sessions_reaping.js @@ -33,10 +33,12 @@ const kCollName = "testColl"; const kConfigSessionsNs = "config.system.sessions"; const kConfigTxnsNs = "config.transactions"; const kImageCollNs = "config.image_collection"; +const kOplogCollNs = "local.oplog.rs"; let sessionsCollOnPrimary = shard0Primary.getCollection(kConfigSessionsNs); let transactionsCollOnPrimary = shard0Primary.getCollection(kConfigTxnsNs); let imageCollOnPrimary = shard0Primary.getCollection(kImageCollNs); +let oplogCollOnPrimary = shard0Primary.getCollection(kOplogCollNs); let testDB = shard0Primary.getDB(kDbName); assert.commandWorked(testDB.createCollection(kCollName)); @@ -181,5 +183,11 @@ assert.eq(0, tojson(transactionsCollOnPrimary.find().toArray())); assert.eq(0, imageCollOnPrimary.find().itcount()); +// Because the config.transactions is implicitly replicated, validate that writes do not generate +// oplog entries, with the exception of deletions. +assert.eq(numTransactionsCollEntries, + oplogCollOnPrimary.find({op: 'd', ns: kConfigTxnsNs}).itcount()); +assert.eq(0, oplogCollOnPrimary.find({op: {'$ne': 'd'}, ns: kConfigTxnsNs}).itcount()); + st.stop(); })(); diff --git a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp index d7d6d9f6d8e..a1aa4f78199 100644 --- a/src/mongo/db/commands/set_feature_compatibility_version_command.cpp +++ b/src/mongo/db/commands/set_feature_compatibility_version_command.cpp @@ -711,7 +711,8 @@ private: // Due to the possibility that the shell or drivers have implicit sessions enabled, we // cannot write to the config.transactions collection while we're in a session. So we // construct a temporary client to as a work around. - auto newClient = opCtx->getServiceContext()->makeClient("InternalSessionsCleanup"); + auto newClient = opCtx->getServiceContext()->makeClient( + SessionCatalog::kInternalSessionsCleanupClient.toString()); { stdx::lock_guard<Client> lk(*newClient.get()); diff --git a/src/mongo/db/exec/update_stage.cpp b/src/mongo/db/exec/update_stage.cpp index bda0a81078b..d6e53a345ef 100644 --- a/src/mongo/db/exec/update_stage.cpp +++ b/src/mongo/db/exec/update_stage.cpp @@ -52,6 +52,7 @@ #include "mongo/db/s/sharding_state.h" #include "mongo/db/s/sharding_write_router.h" #include "mongo/db/service_context.h" +#include "mongo/db/session_catalog.h" #include "mongo/db/storage/duplicate_key_error_info.h" #include "mongo/db/update/path_support.h" #include "mongo/db/update/storage_validation.h" @@ -372,6 +373,17 @@ PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) { return PlanStage::IS_EOF; } + boost::optional<repl::UnreplicatedWritesBlock> unReplBlock; + const auto isSessionCleanupClient = + opCtx()->getClient()->desc() == SessionCatalog::kInternalSessionsCleanupClient; + if (collection()->ns().isImplicitlyReplicated() && !_isUserInitiatedWrite && + !isSessionCleanupClient) { + // Implictly replicated collections do not replicate updates. + // However, user-initiated writes and some background maintenance tasks are allowed + // to replicate as they cannot be derived from the oplog. + unReplBlock.emplace(opCtx()); + } + // It is possible that after an update was applied, a WriteConflictException // occurred and prevented us from returning ADVANCED with the requested version // of the document. diff --git a/src/mongo/db/exec/upsert_stage.cpp b/src/mongo/db/exec/upsert_stage.cpp index a6f6bf73d04..f1b684a544d 100644 --- a/src/mongo/db/exec/upsert_stage.cpp +++ b/src/mongo/db/exec/upsert_stage.cpp @@ -35,6 +35,7 @@ #include "mongo/db/curop_failpoint_helpers.h" #include "mongo/db/query/query_feature_flags_gen.h" #include "mongo/db/s/operation_sharding_state.h" +#include "mongo/db/session_catalog.h" #include "mongo/db/update/storage_validation.h" #include "mongo/s/would_change_owning_shard_exception.h" @@ -72,6 +73,14 @@ PlanStage::StageState UpsertStage::doWork(WorkingSetID* out) { return StageState::IS_EOF; } + boost::optional<repl::UnreplicatedWritesBlock> unReplBlock; + const auto isSessionCleanupClient = + opCtx()->getClient()->desc() == SessionCatalog::kInternalSessionsCleanupClient; + if (collection()->ns().isImplicitlyReplicated() && !isSessionCleanupClient) { + // Implictly replicated collections do not replicate updates. + unReplBlock.emplace(opCtx()); + } + // First, attempt to perform the update on a matching document. auto updateState = UpdateStage::doWork(out); diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp index 631177c4d63..f1e258ba2ce 100644 --- a/src/mongo/db/namespace_string.cpp +++ b/src/mongo/db/namespace_string.cpp @@ -347,6 +347,10 @@ bool NamespaceString::isConfigImagesCollection() const { return ns() == kConfigImagesNamespace.ns(); } +bool NamespaceString::isConfigTransactionsCollection() const { + return ns() == kSessionTransactionsTableNamespace.ns(); +} + NamespaceString NamespaceString::makeTimeseriesBucketsNamespace() const { return {db(), kTimeseriesBucketsCollectionPrefix.toString() + coll()}; } @@ -356,6 +360,18 @@ NamespaceString NamespaceString::getTimeseriesViewNamespace() const { return {db(), coll().substr(kTimeseriesBucketsCollectionPrefix.size())}; } +bool NamespaceString::isImplicitlyReplicated() const { + if (isChangeStreamPreImagesCollection() || isConfigImagesCollection() || + isConfigTransactionsCollection() || isChangeCollection()) { + // Implicitly replicated namespaces are replicated, although they only replicate a subset of + // writes. + invariant(isReplicated()); + return true; + } + + return false; +} + bool NamespaceString::isReplicated() const { if (isLocal()) { return false; diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h index 17ea8855bc7..339c62ce23f 100644 --- a/src/mongo/db/namespace_string.h +++ b/src/mongo/db/namespace_string.h @@ -307,6 +307,9 @@ public: bool isSystemDotProfile() const { return coll() == "system.profile"; } + bool isChangeCollection() const { + return (db() == kConfigDb) && coll().startsWith("changes."); + } bool isSystemDotViews() const { return coll() == kSystemDotViewsCollectionName; } @@ -384,6 +387,11 @@ public: bool isConfigImagesCollection() const; /** + * Returns whether the specified namespace is config.transactions. + */ + bool isConfigTransactionsCollection() const; + + /** * Returns the time-series buckets namespace for this view. */ NamespaceString makeTimeseriesBucketsNamespace() const; @@ -394,6 +402,15 @@ public: NamespaceString getTimeseriesViewNamespace() const; /** + * Returns whether the namespace is implicitly replicated, based only on its string value. + * + * An implicitly replicated namespace is an internal namespace which does not replicate writes + * via the oplog, with the exception of deletions. Writes are not replicated as an optimization + * because their content can be reliably derived from entries in the oplog. + */ + bool isImplicitlyReplicated() const; + + /** * Returns whether a namespace is replicated, based only on its string value. One notable * omission is that map reduce `tmp.mr` collections may or may not be replicated. Callers must * decide how to handle that case separately. diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index 398c577df06..51bf478673f 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -297,7 +297,6 @@ void writeToImageCollection(OperationContext* opCtx, imageEntry.setImageKind(imageKind); imageEntry.setImage(dataImage); - repl::UnreplicatedWritesBlock unreplicated(opCtx); DisableDocumentValidation documentValidationDisabler( opCtx, DocumentValidationSettings::kDisableInternalValidation); diff --git a/src/mongo/db/op_observer_impl_test.cpp b/src/mongo/db/op_observer_impl_test.cpp index 373d979feab..d0273e162ea 100644 --- a/src/mongo/db/op_observer_impl_test.cpp +++ b/src/mongo/db/op_observer_impl_test.cpp @@ -187,6 +187,7 @@ public: reset(opCtx, NamespaceString::kRsOplogNamespace); reset(opCtx, NamespaceString::kSessionTransactionsTableNamespace); reset(opCtx, NamespaceString::kConfigImagesNamespace); + reset(opCtx, NamespaceString::kChangeStreamPreImagesNamespace); } protected: diff --git a/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp b/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp index 01335139a8d..68e2e6949f8 100644 --- a/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp +++ b/src/mongo/db/pipeline/change_stream_pre_image_helpers.cpp @@ -48,7 +48,6 @@ void writeToChangeStreamPreImagesCollection(OperationContext* opCtx, // This lock acquisition can block on a stronger lock held by another operation modifying the // pre-images collection. There are no known cases where an operation holding an exclusive lock // on the pre-images collection also waits for oplog visibility. - repl::UnreplicatedWritesBlock unreplicated(opCtx); AllowLockAcquisitionOnTimestampedUnitOfWork allowLockAcquisition(opCtx->lockState()); AutoGetCollection preimagesCollectionRaii(opCtx, collectionNamespace, LockMode::MODE_IX); UpdateResult res = Helpers::upsert(opCtx, collectionNamespace.toString(), preImage.toBSON()); diff --git a/src/mongo/db/repl/dbcheck.cpp b/src/mongo/db/repl/dbcheck.cpp index 79a98205972..cb909bbd08e 100644 --- a/src/mongo/db/repl/dbcheck.cpp +++ b/src/mongo/db/repl/dbcheck.cpp @@ -206,8 +206,7 @@ std::unique_ptr<HealthLogEntry> dbCheckBatchEntry( } // Implcitily replicated collections and capped collections not replicating truncation are // not designed to be consistent, so inconsistency is not necessarily pathological. - if (nss.isChangeStreamPreImagesCollection() || nss.isConfigImagesCollection() || - (options && options->capped)) { + if (nss.isImplicitlyReplicated() || (options && options->capped)) { return SeverityEnum::Warning; } diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index ea9b83dc41b..ec5933b830d 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -297,7 +297,6 @@ void writeToImageCollection(OperationContext* opCtx, request.setFromOplogApplication(true); try { // This code path can also be hit by things such as `applyOps` and tenant migrations. - repl::UnreplicatedWritesBlock dontReplicate(opCtx); ::mongo::update(opCtx, autoColl.getDb(), request); } catch (const ExceptionFor<ErrorCodes::DuplicateKey>&) { // We can get a duplicate key when two upserts race on inserting a document. diff --git a/src/mongo/db/session_catalog.h b/src/mongo/db/session_catalog.h index 7bed6ae91a3..bcf696042b7 100644 --- a/src/mongo/db/session_catalog.h +++ b/src/mongo/db/session_catalog.h @@ -59,6 +59,8 @@ class SessionCatalog { friend class OperationContextSession; public: + static constexpr StringData kInternalSessionsCleanupClient = "InternalSessionsCleanup"_sd; + class ScopedCheckedOutSession; class SessionToKill; |