diff options
28 files changed, 312 insertions, 66 deletions
diff --git a/jstests/replsets/buildindexes_false_with_system_indexes.js b/jstests/replsets/buildindexes_false_with_system_indexes.js new file mode 100644 index 00000000000..96fcc732764 --- /dev/null +++ b/jstests/replsets/buildindexes_false_with_system_indexes.js @@ -0,0 +1,86 @@ +/* + * Tests that hidden nodes with buildIndexes: false behave correctly when system tables with + * default indexes are created. + * + */ +(function() { + 'use strict'; + + load("jstests/replsets/rslib.js"); + + const testName = "buildindexes_false_with_system_indexes"; + + let rst = new ReplSetTest({ + name: testName, + nodes: [ + {}, + {rsConfig: {priority: 0}}, + {rsConfig: {priority: 0, hidden: true, buildIndexes: false}}, + ], + }); + const nodes = rst.startSet(); + rst.initiate(); + + let primary = rst.getPrimary(); + assert.eq(primary, nodes[0]); + let secondary = nodes[1]; + const hidden = nodes[2]; + + rst.awaitReplication(); + jsTestLog("Creating a role in the admin database"); + let adminDb = primary.getDB("admin"); + adminDb.createRole( + {role: 'test_role', roles: [{role: 'readWrite', db: 'test'}], privileges: []}); + rst.awaitReplication(); + + jsTestLog("Creating a user in the admin database"); + adminDb.createUser({user: 'test_user', pwd: 'test', roles: [{role: 'test_role', db: 'admin'}]}); + rst.awaitReplication(); + + // Make sure the indexes we expect are present on all nodes. The buildIndexes: false node + // should have only the _id_ index. + let secondaryAdminDb = secondary.getDB("admin"); + const hiddenAdminDb = hidden.getDB("admin"); + + assert.eq(["_id_", "user_1_db_1"], adminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "role_1_db_1"], adminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "user_1_db_1"], + secondaryAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "role_1_db_1"], + secondaryAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + + // Drop the indexes and restart the secondary. The indexes should not be re-created. + jsTestLog("Dropping system indexes and restarting secondary."); + adminDb.system.users.dropIndex("user_1_db_1"); + adminDb.system.roles.dropIndex("role_1_db_1"); + rst.awaitReplication(); + assert.eq(["_id_"], adminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], adminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], secondaryAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], secondaryAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + + secondary = rst.restart(secondary, {}, true /* wait for node to become healthy */); + secondaryAdminDb = secondary.getDB("admin"); + assert.eq(["_id_"], secondaryAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], secondaryAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + + jsTestLog("Now restarting primary; indexes should be created."); + rst.restart(primary); + primary = rst.getPrimary(); + rst.awaitReplication(); + adminDb = primary.getDB("admin"); + assert.eq(["_id_", "user_1_db_1"], adminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "role_1_db_1"], adminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "user_1_db_1"], + secondaryAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_", "role_1_db_1"], + secondaryAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.users.getIndexes().map(x => x.name).sort()); + assert.eq(["_id_"], hiddenAdminDb.system.roles.getIndexes().map(x => x.name).sort()); + + rst.stopSet(); +}()); diff --git a/src/mongo/db/catalog/database_impl.cpp b/src/mongo/db/catalog/database_impl.cpp index 5d4ed394e7c..1ed51dcfe9a 100644 --- a/src/mongo/db/catalog/database_impl.cpp +++ b/src/mongo/db/catalog/database_impl.cpp @@ -774,26 +774,38 @@ Collection* DatabaseImpl::createCollection(OperationContext* opCtx, "request doesn't allow collection to be created implicitly", OperationShardingState::get(opCtx).allowImplicitCollectionCreation()); + auto coordinator = repl::ReplicationCoordinator::get(opCtx); + bool canAcceptWrites = + (coordinator->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) || + coordinator->canAcceptWritesForDatabase(opCtx, nss.db()) || nss.isSystemDotProfile(); + + CollectionOptions optionsWithUUID = options; bool generatedUUID = false; if (!optionsWithUUID.uuid) { - auto coordinator = repl::ReplicationCoordinator::get(opCtx); - bool canGenerateUUID = - (coordinator->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet) || - coordinator->canAcceptWritesForDatabase(opCtx, nss.db()) || nss.isSystemDotProfile(); - - if (!canGenerateUUID) { + if (!canAcceptWrites) { std::string msg = str::stream() << "Attempted to create a new collection " << nss.ns() << " without a UUID"; severe() << msg; uasserted(ErrorCodes::InvalidOptions, msg); } - if (canGenerateUUID) { + if (canAcceptWrites) { optionsWithUUID.uuid.emplace(CollectionUUID::gen()); generatedUUID = true; } } + // Because writing the oplog entry depends on having the full spec for the _id index, which is + // not available until the collection is actually created, we can't write the oplog entry until + // after we have created the collection. In order to make the storage timestamp for the + // collection create always correct even when other operations are present in the same storage + // transaction, we reserve an opTime before the collection creation, then pass it to the + // opObserver. Reserving the optime automatically sets the storage timestamp. + OplogSlot createOplogSlot; + if (canAcceptWrites && supportsDocLocking() && !coordinator->isOplogDisabledFor(opCtx, nss)) { + createOplogSlot = repl::getNextOpTime(opCtx); + } + _checkCanCreateCollection(opCtx, nss, optionsWithUUID); audit::logCreateCollection(&cc(), ns); @@ -833,16 +845,21 @@ Collection* DatabaseImpl::createCollection(OperationContext* opCtx, !nss.isReplicated()); } } - - if (nss.isSystem()) { - createSystemIndexes(opCtx, collection); - } } MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLoggingCreateCollection); opCtx->getServiceContext()->getOpObserver()->onCreateCollection( - opCtx, collection, nss, optionsWithUUID, fullIdIndexSpec); + opCtx, collection, nss, optionsWithUUID, fullIdIndexSpec, createOplogSlot); + + // It is necessary to create the system index *after* running the onCreateCollection so that + // the storage timestamp for the index creation is after the storage timestamp for the + // collection creation, and the opTimes for the corresponding oplog entries are the same as the + // storage timestamps. This way both primary and any secondaries will see the index created + // after the collection is created. + if (canAcceptWrites && createIdIndex && nss.isSystem()) { + createSystemIndexes(opCtx, collection); + } return collection; } diff --git a/src/mongo/db/catalog/rename_collection_test.cpp b/src/mongo/db/catalog/rename_collection_test.cpp index 300bc94192f..092b1328406 100644 --- a/src/mongo/db/catalog/rename_collection_test.cpp +++ b/src/mongo/db/catalog/rename_collection_test.cpp @@ -88,7 +88,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override; + const BSONObj& idIndex, + const OplogSlot& createOpTime) override; repl::OpTime onDropCollection(OperationContext* opCtx, const NamespaceString& collectionName, @@ -165,9 +166,10 @@ void OpObserverMock::onCreateCollection(OperationContext* opCtx, Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) { + const BSONObj& idIndex, + const OplogSlot& createOpTime) { _logOp(opCtx, collectionName, "create"); - OpObserverNoop::onCreateCollection(opCtx, coll, collectionName, options, idIndex); + OpObserverNoop::onCreateCollection(opCtx, coll, collectionName, options, idIndex, createOpTime); } repl::OpTime OpObserverMock::onDropCollection(OperationContext* opCtx, diff --git a/src/mongo/db/catalog/uuid_catalog.cpp b/src/mongo/db/catalog/uuid_catalog.cpp index 5ea95f14d2a..781c0748fb2 100644 --- a/src/mongo/db/catalog/uuid_catalog.cpp +++ b/src/mongo/db/catalog/uuid_catalog.cpp @@ -48,7 +48,8 @@ void UUIDCatalogObserver::onCreateCollection(OperationContext* opCtx, Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) { + const BSONObj& idIndex, + const OplogSlot& createOpTime) { if (!options.uuid) return; UUIDCatalog& catalog = UUIDCatalog::get(opCtx); diff --git a/src/mongo/db/catalog/uuid_catalog.h b/src/mongo/db/catalog/uuid_catalog.h index cc093b31c06..7b1ace95701 100644 --- a/src/mongo/db/catalog/uuid_catalog.h +++ b/src/mongo/db/catalog/uuid_catalog.h @@ -73,7 +73,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override; + const BSONObj& idIndex, + const OplogSlot& createOpTime) override; void onCollMod(OperationContext* opCtx, const NamespaceString& nss, OptionalCollectionUUID uuid, diff --git a/src/mongo/db/commands/dbcheck.cpp b/src/mongo/db/commands/dbcheck.cpp index 3380a7c3198..9c4a4f376d5 100644 --- a/src/mongo/db/commands/dbcheck.cpp +++ b/src/mongo/db/commands/dbcheck.cpp @@ -478,7 +478,8 @@ private: {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); uow.commit(); return result; }); diff --git a/src/mongo/db/commands/mr_test.cpp b/src/mongo/db/commands/mr_test.cpp index 9a640137ece..f8c81c473a5 100644 --- a/src/mongo/db/commands/mr_test.cpp +++ b/src/mongo/db/commands/mr_test.cpp @@ -292,7 +292,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override; + const BSONObj& idIndex, + const OplogSlot& createOpTime) override; // Hook for onInserts. Defaults to a no-op function but may be overridden to inject exceptions // while mapReduce inserts its results into the temporary output collection. @@ -315,7 +316,8 @@ void MapReduceOpObserver::onCreateCollection(OperationContext*, Collection*, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj&) { + const BSONObj&, + const OplogSlot&) { if (!options.temp) { return; } diff --git a/src/mongo/db/db.cpp b/src/mongo/db/db.cpp index 996a0123864..d2282433f7e 100644 --- a/src/mongo/db/db.cpp +++ b/src/mongo/db/db.cpp @@ -475,6 +475,9 @@ ExitCode _initAndListen(int listenPort) { log() << redact(status); if (status == ErrorCodes::AuthSchemaIncompatible) { exitCleanly(EXIT_NEED_UPGRADE); + } else if (status == ErrorCodes::NotMaster) { + // Try creating the indexes if we become master. If we do not become master, + // the master will create the indexes and we will replicate them. } else { quickExit(EXIT_FAILURE); } diff --git a/src/mongo/db/free_mon/free_mon_op_observer.h b/src/mongo/db/free_mon/free_mon_op_observer.h index 05d16a21e4a..02578a32a1f 100644 --- a/src/mongo/db/free_mon/free_mon_op_observer.h +++ b/src/mongo/db/free_mon/free_mon_op_observer.h @@ -80,7 +80,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) final {} + const BSONObj& idIndex, + const OplogSlot& createOpTime) final {} void onCollMod(OperationContext* opCtx, const NamespaceString& nss, diff --git a/src/mongo/db/op_observer.h b/src/mongo/db/op_observer.h index 38027587930..e042cc3dfc5 100644 --- a/src/mongo/db/op_observer.h +++ b/src/mongo/db/op_observer.h @@ -42,6 +42,7 @@ namespace mongo { struct InsertStatement; class OperationContext; +struct OplogSlot; namespace repl { class OpTime; @@ -148,7 +149,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) = 0; + const BSONObj& idIndex, + const OplogSlot& createOpTime) = 0; /** * This function logs an oplog entry when a 'collMod' command on a collection is executed. * Since 'collMod' commands can take a variety of different formats, the 'o' field of the diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index 7305ff49267..06bf33a1fe6 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -75,7 +75,8 @@ repl::OpTime logOperation(OperationContext* opCtx, const OperationSessionInfo& sessionInfo, StmtId stmtId, const repl::OplogLink& oplogLink, - bool prepare) { + bool prepare, + const OplogSlot& oplogSlot) { auto& times = OpObserver::Times::get(opCtx).reservedOpTimes; auto opTime = repl::logOp(opCtx, opstr, @@ -88,7 +89,8 @@ repl::OpTime logOperation(OperationContext* opCtx, sessionInfo, stmtId, oplogLink, - prepare); + prepare, + oplogSlot); times.push_back(opTime); return opTime; @@ -198,7 +200,8 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx, sessionInfo, args.stmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); opTimes.prePostImageOpTime = noteUpdateOpTime; @@ -220,7 +223,8 @@ OpTimeBundle replLogUpdate(OperationContext* opCtx, sessionInfo, args.stmtId, oplogLink, - false /* prepare */); + false /* prepare */, + OplogSlot()); return opTimes; } @@ -259,7 +263,8 @@ OpTimeBundle replLogDelete(OperationContext* opCtx, sessionInfo, stmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); opTimes.prePostImageOpTime = noteOplog; oplogLink.preImageOpTime = noteOplog; } @@ -276,7 +281,8 @@ OpTimeBundle replLogDelete(OperationContext* opCtx, sessionInfo, stmtId, oplogLink, - false /* prepare */); + false /* prepare */, + OplogSlot()); return opTimes; } @@ -303,7 +309,8 @@ OpTimeBundle replLogApplyOps(OperationContext* opCtx, sessionInfo, stmtId, oplogLink, - prepare); + prepare, + OplogSlot()); return times; } @@ -336,7 +343,8 @@ void OpObserverImpl::onCreateIndex(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } else { logOperation(opCtx, "i", @@ -349,7 +357,8 @@ void OpObserverImpl::onCreateIndex(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } AuthorizationManager::get(opCtx->getServiceContext()) @@ -574,14 +583,16 @@ void OpObserverImpl::onInternalOpMessage(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } void OpObserverImpl::onCreateCollection(OperationContext* opCtx, Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) { + const BSONObj& idIndex, + const OplogSlot& createOpTime) { const auto cmdNss = collectionName.getCommandNS(); BSONObjBuilder b; @@ -618,7 +629,8 @@ void OpObserverImpl::onCreateCollection(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + createOpTime); } AuthorizationManager::get(opCtx->getServiceContext()) @@ -665,7 +677,8 @@ void OpObserverImpl::onCollMod(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } AuthorizationManager::get(opCtx->getServiceContext()) @@ -702,7 +715,8 @@ void OpObserverImpl::onDropDatabase(OperationContext* opCtx, const std::string& {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); uassert( 50714, "dropping the admin database is not allowed.", dbName != NamespaceString::kAdminDb); @@ -736,7 +750,8 @@ repl::OpTime OpObserverImpl::onDropCollection(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } uassert(50715, @@ -777,7 +792,8 @@ void OpObserverImpl::onDropIndex(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); AuthorizationManager::get(opCtx->getServiceContext()) ->logOp(opCtx, "c", cmdNss, cmdObj, &indexInfo); @@ -813,7 +829,8 @@ repl::OpTime OpObserverImpl::preRenameCollection(OperationContext* const opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); return {}; } @@ -894,7 +911,8 @@ void OpObserverImpl::onEmptyCapped(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); } AuthorizationManager::get(opCtx->getServiceContext()) diff --git a/src/mongo/db/op_observer_impl.h b/src/mongo/db/op_observer_impl.h index fb67efbe104..67a645332a4 100644 --- a/src/mongo/db/op_observer_impl.h +++ b/src/mongo/db/op_observer_impl.h @@ -69,7 +69,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override; + const BSONObj& idIndex, + const OplogSlot& createOpTime) override; void onCollMod(OperationContext* opCtx, const NamespaceString& nss, OptionalCollectionUUID uuid, diff --git a/src/mongo/db/op_observer_noop.h b/src/mongo/db/op_observer_noop.h index 5d74fa914af..9e1a496660a 100644 --- a/src/mongo/db/op_observer_noop.h +++ b/src/mongo/db/op_observer_noop.h @@ -64,7 +64,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override {} + const BSONObj& idIndex, + const OplogSlot& createOpTime) override {} void onCollMod(OperationContext* opCtx, const NamespaceString& nss, OptionalCollectionUUID uuid, diff --git a/src/mongo/db/op_observer_registry.h b/src/mongo/db/op_observer_registry.h index 2ab2b971219..2d322329c99 100644 --- a/src/mongo/db/op_observer_registry.h +++ b/src/mongo/db/op_observer_registry.h @@ -116,10 +116,11 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override { + const BSONObj& idIndex, + const OplogSlot& createOpTime) override { ReservedTimes times{opCtx}; for (auto& o : _observers) - o->onCreateCollection(opCtx, coll, collectionName, options, idIndex); + o->onCreateCollection(opCtx, coll, collectionName, options, idIndex, createOpTime); } void onCollMod(OperationContext* const opCtx, diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index a1b502e935a..e9a4998db59 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -1576,6 +1576,7 @@ env.Library( '$BUILD_DIR/mongo/db/s/sharding_runtime_d', '$BUILD_DIR/mongo/db/service_context', '$BUILD_DIR/mongo/db/stats/counters', + '$BUILD_DIR/mongo/db/system_index', '$BUILD_DIR/mongo/rpc/client_metadata', '$BUILD_DIR/mongo/util/fail_point', 'bgsync', diff --git a/src/mongo/db/repl/oplog.cpp b/src/mongo/db/repl/oplog.cpp index a0d13182694..42f471bedd4 100644 --- a/src/mongo/db/repl/oplog.cpp +++ b/src/mongo/db/repl/oplog.cpp @@ -434,7 +434,8 @@ OpTime logOp(OperationContext* opCtx, const OperationSessionInfo& sessionInfo, StmtId statementId, const OplogLink& oplogLink, - bool prepare) { + bool prepare, + const OplogSlot& oplogSlot) { auto replCoord = ReplicationCoordinator::get(opCtx); // For commands, the test below is on the command ns and therefore does not check for // specific namespaces such as system.profile. This is the caller's responsibility. @@ -453,7 +454,11 @@ OpTime logOp(OperationContext* opCtx, OplogSlot slot; WriteUnitOfWork wuow(opCtx); - _getNextOpTimes(opCtx, oplog, 1, &slot); + if (oplogSlot.opTime.isNull()) { + _getNextOpTimes(opCtx, oplog, 1, &slot); + } else { + slot = oplogSlot; + } auto writer = _logOpWriter(opCtx, opstr, diff --git a/src/mongo/db/repl/oplog.h b/src/mongo/db/repl/oplog.h index b4d34ca0ef6..ef96a304626 100644 --- a/src/mongo/db/repl/oplog.h +++ b/src/mongo/db/repl/oplog.h @@ -127,6 +127,7 @@ std::vector<OpTime> logInsertOps(OperationContext* opCtx, * linked via prevTs, and the timestamps of the oplog entry that contains the document * before/after update was applied. The timestamps are ignored if isNull() is true. * prepare this specifies if the oplog entry should be put into a 'prepare' state. + * oplogSlot If non-null, use this reserved oplog slot instead of a new one. * * Returns the optime of the oplog entry written to the oplog. * Returns a null optime if oplog was not modified. @@ -142,7 +143,8 @@ OpTime logOp(OperationContext* opCtx, const OperationSessionInfo& sessionInfo, StmtId stmtId, const OplogLink& oplogLink, - bool prepare); + bool prepare, + const OplogSlot& oplogSlot); // Flush out the cached pointer to the oplog. // Used by the closeDatabase command to ensure we don't cache closed things. diff --git a/src/mongo/db/repl/oplog_test.cpp b/src/mongo/db/repl/oplog_test.cpp index 93738bb47ed..4f0de89c5f8 100644 --- a/src/mongo/db/repl/oplog_test.cpp +++ b/src/mongo/db/repl/oplog_test.cpp @@ -111,7 +111,8 @@ TEST_F(OplogTest, LogOpReturnsOpTimeOnSuccessfulInsertIntoOplogCollection) { {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); ASSERT_FALSE(opTime.isNull()); wunit.commit(); } @@ -234,7 +235,8 @@ OpTime _logOpNoopWithMsg(OperationContext* opCtx, {}, kUninitializedStmtId, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); ASSERT_FALSE(opTime.isNull()); ASSERT(opTimeNssMap->find(opTime) == opTimeNssMap->end()) diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp index 139d23183ab..dfd54fb93f3 100644 --- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp +++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp @@ -79,6 +79,7 @@ #include "mongo/db/service_context.h" #include "mongo/db/session_catalog.h" #include "mongo/db/storage/storage_engine.h" +#include "mongo/db/system_index.h" #include "mongo/executor/network_connection_hook.h" #include "mongo/executor/network_interface.h" #include "mongo/executor/network_interface_factory.h" @@ -519,6 +520,16 @@ OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationC _shardingOnTransitionToPrimaryHook(opCtx); _dropAllTempCollections(opCtx); + // It is only necessary to check the system indexes on the first transition to master. + // On subsequent transitions to master the indexes will have already been created. + static std::once_flag verifySystemIndexesOnce; + std::call_once(verifySystemIndexesOnce, [opCtx] { + const auto globalAuthzManager = AuthorizationManager::get(opCtx->getServiceContext()); + if (globalAuthzManager->shouldValidateAuthSchemaOnStartup()) { + fassert(65536, verifySystemIndexes(opCtx)); + } + }); + serverGlobalParams.validateFeaturesAsMaster.store(true); return opTimeToReturn; diff --git a/src/mongo/db/repl/sync_tail_test_fixture.cpp b/src/mongo/db/repl/sync_tail_test_fixture.cpp index dc1ed3dec95..3929aa5239d 100644 --- a/src/mongo/db/repl/sync_tail_test_fixture.cpp +++ b/src/mongo/db/repl/sync_tail_test_fixture.cpp @@ -78,7 +78,8 @@ void SyncTailOpObserver::onCreateCollection(OperationContext* opCtx, Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) { + const BSONObj& idIndex, + const OplogSlot& createOpTime) { if (!onCreateCollectionFn) { return; } diff --git a/src/mongo/db/repl/sync_tail_test_fixture.h b/src/mongo/db/repl/sync_tail_test_fixture.h index cbbe56b94b5..4f483d26eb6 100644 --- a/src/mongo/db/repl/sync_tail_test_fixture.h +++ b/src/mongo/db/repl/sync_tail_test_fixture.h @@ -74,7 +74,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override; + const BSONObj& idIndex, + const OplogSlot& createOpTime) override; // Hooks for OpObserver functions. Defaults to a no-op function but may be overridden to check // actual documents mutated. diff --git a/src/mongo/db/s/config_server_op_observer.h b/src/mongo/db/s/config_server_op_observer.h index 91870bf4732..234d6b2ad60 100644 --- a/src/mongo/db/s/config_server_op_observer.h +++ b/src/mongo/db/s/config_server_op_observer.h @@ -80,7 +80,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override {} + const BSONObj& idIndex, + const OplogSlot& createOpTime) override {} void onCollMod(OperationContext* opCtx, const NamespaceString& nss, diff --git a/src/mongo/db/s/session_catalog_migration_destination.cpp b/src/mongo/db/s/session_catalog_migration_destination.cpp index 0c318ba7605..68c62abbce2 100644 --- a/src/mongo/db/s/session_catalog_migration_destination.cpp +++ b/src/mongo/db/s/session_catalog_migration_destination.cpp @@ -282,7 +282,8 @@ ProcessOplogResult processSessionOplog(OperationContext* opCtx, sessionInfo, stmtId, oplogLink, - false /* prepare */); + false /* prepare */, + OplogSlot()); auto oplogOpTime = result.oplogTime; uassert(40633, diff --git a/src/mongo/db/s/shard_server_op_observer.h b/src/mongo/db/s/shard_server_op_observer.h index 413373446bd..05b87bdb238 100644 --- a/src/mongo/db/s/shard_server_op_observer.h +++ b/src/mongo/db/s/shard_server_op_observer.h @@ -81,7 +81,8 @@ public: Collection* coll, const NamespaceString& collectionName, const CollectionOptions& options, - const BSONObj& idIndex) override {} + const BSONObj& idIndex, + const OplogSlot& createOpTime) override {} void onCollMod(OperationContext* opCtx, const NamespaceString& nss, diff --git a/src/mongo/db/session_test.cpp b/src/mongo/db/session_test.cpp index 248da45d091..5052e0150bd 100644 --- a/src/mongo/db/session_test.cpp +++ b/src/mongo/db/session_test.cpp @@ -159,7 +159,8 @@ protected: osi, stmtId, link, - false /* prepare */); + false /* prepare */, + OplogSlot()); } void bumpTxnNumberFromDifferentOpCtx(Session* session, TxnNumber newTxnNum) { @@ -554,7 +555,8 @@ TEST_F(SessionTest, ErrorOnlyWhenStmtIdBeingCheckedIsNotInCache) { osi, 1, {}, - false /* prepare */); + false /* prepare */, + OplogSlot()); session.onWriteOpCompletedOnPrimary(opCtx(), txnNum, {1}, opTime, wallClockTime); wuow.commit(); @@ -581,7 +583,8 @@ TEST_F(SessionTest, ErrorOnlyWhenStmtIdBeingCheckedIsNotInCache) { osi, kIncompleteHistoryStmtId, link, - false /* prepare */); + false /* prepare */, + OplogSlot()); session.onWriteOpCompletedOnPrimary( opCtx(), txnNum, {kIncompleteHistoryStmtId}, opTime, wallClockTime); diff --git a/src/mongo/db/system_index.cpp b/src/mongo/db/system_index.cpp index f1e2992ec70..d662a718e10 100644 --- a/src/mongo/db/system_index.cpp +++ b/src/mongo/db/system_index.cpp @@ -105,6 +105,15 @@ void generateSystemIndexForExistingCollection(OperationContext* opCtx, return; } + // Do not try to generate any system indexes on a secondary. + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + uassert(ErrorCodes::NotMaster, + "Not primary while creating authorization index", + replCoord->getReplicationMode() != repl::ReplicationCoordinator::modeReplSet || + replCoord->canAcceptWritesForDatabase(opCtx, ns.db())); + + invariant(!opCtx->lockState()->inAWriteUnitOfWork()); + try { auto indexSpecStatus = index_key_validate::validateIndexSpec( opCtx, spec.toBSON(), ns, serverGlobalParams.featureCompatibility); @@ -124,7 +133,10 @@ void generateSystemIndexForExistingCollection(OperationContext* opCtx, writeConflictRetry(opCtx, "authorization index regeneration", ns.ns(), [&] { WriteUnitOfWork wunit(opCtx); - indexer.commit(); + indexer.commit([opCtx, &ns, collection](const BSONObj& spec) { + opCtx->getServiceContext()->getOpObserver()->onCreateIndex( + opCtx, ns, collection->uuid(), spec, false /* fromMigrate */); + }); wunit.commit(); }); @@ -203,25 +215,29 @@ Status verifySystemIndexes(OperationContext* opCtx) { void createSystemIndexes(OperationContext* opCtx, Collection* collection) { invariant(collection); const NamespaceString& ns = collection->ns(); + BSONObj indexSpec; if (ns == AuthorizationManager::usersCollectionNamespace) { - auto indexSpec = + indexSpec = fassert(40455, index_key_validate::validateIndexSpec(opCtx, v3SystemUsersIndexSpec.toBSON(), ns, serverGlobalParams.featureCompatibility)); - fassert(40456, - collection->getIndexCatalog()->createIndexOnEmptyCollection(opCtx, indexSpec)); } else if (ns == AuthorizationManager::rolesCollectionNamespace) { - auto indexSpec = + indexSpec = fassert(40457, index_key_validate::validateIndexSpec(opCtx, v3SystemRolesIndexSpec.toBSON(), ns, serverGlobalParams.featureCompatibility)); - - fassert(40458, + } + if (!indexSpec.isEmpty()) { + opCtx->getServiceContext()->getOpObserver()->onCreateIndex( + opCtx, ns, collection->uuid(), indexSpec, false /* fromMigrate */); + // Note that the opObserver is called prior to creating the index. This ensures the index + // write gets the same storage timestamp as the oplog entry. + fassert(40456, collection->getIndexCatalog()->createIndexOnEmptyCollection(opCtx, indexSpec)); } } diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index 7d558c814a9..0a26e2d3002 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -2260,6 +2260,65 @@ public: } }; +class CreateCollectionWithSystemIndex : public StorageTimestampTest { +public: + void run() { + // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes. + if (mongo::storageGlobalParams.engine != "wiredTiger") { + return; + } + + const LogicalTime indexCreateLt = futureLt.addTicks(1); + const Timestamp indexCreateTs = indexCreateLt.asTimestamp(); + + NamespaceString nss("admin.system.users"); + + { ASSERT_FALSE(AutoGetCollectionForReadCommand(_opCtx, nss).getCollection()); } + + ASSERT_OK(createCollection(_opCtx, nss.db().toString(), BSON("create" << nss.coll()))); + + { ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss).getCollection()); } + + BSONObj result = queryOplog(BSON("op" + << "c" + << "ns" + << nss.getCommandNS().ns() + << "o.create" + << nss.coll())); + repl::OplogEntry op(result); + // The logOp() call for createCollection should have timestamp 'futureTs', which will also + // be the timestamp at which we do the write which creates the collection. Thus we expect + // the collection to appear at 'futureTs' and not before. + ASSERT_EQ(op.getTimestamp(), futureTs) << op.toBSON(); + + assertNamespaceInIdents(nss, pastTs, false); + assertNamespaceInIdents(nss, presentTs, false); + assertNamespaceInIdents(nss, futureTs, true); + assertNamespaceInIdents(nss, indexCreateTs, true); + assertNamespaceInIdents(nss, nullTs, true); + + result = queryOplog(BSON("op" + << "c" + << "ns" + << nss.getCommandNS().ns() + << "o.createIndexes" + << nss.coll())); + repl::OplogEntry indexOp(result); + ASSERT_EQ(indexOp.getObject()["name"].str(), "user_1_db_1"); + ASSERT_GT(indexOp.getTimestamp(), futureTs) << op.toBSON(); + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IS, LockMode::MODE_IS); + auto kvStorageEngine = + dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getStorageEngine()); + KVCatalog* kvCatalog = kvStorageEngine->getCatalog(); + auto indexIdent = kvCatalog->getIndexIdent(_opCtx, nss.ns(), "user_1_db_1"); + assertIdentsMissingAtTimestamp(kvCatalog, "", indexIdent, pastTs); + assertIdentsMissingAtTimestamp(kvCatalog, "", indexIdent, presentTs); + assertIdentsMissingAtTimestamp(kvCatalog, "", indexIdent, futureTs); + assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, indexCreateTs); + assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, nullTs); + } +}; + class AllStorageTimestampTests : public unittest::Suite { public: AllStorageTimestampTests() : unittest::Suite("StorageTimestampTests") {} @@ -2304,6 +2363,7 @@ public: add<TimestampIndexBuilderOnPrimary<true>>(); add<SecondaryReadsDuringBatchApplicationAreAllowed>(); add<ViewCreationSeparateTransaction>(); + add<CreateCollectionWithSystemIndex>(); } }; diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js index 74c677671bc..94e5e8b18b3 100644 --- a/src/mongo/shell/replsettest.js +++ b/src/mongo/shell/replsettest.js @@ -1816,6 +1816,7 @@ var ReplSetTest = function(opts) { // '_master' must have been populated. var primary = rst._master; var combinedDBs = new Set(primary.getDBNames()); + const replSetConfig = rst.getReplSetConfigFromNode(); slaves.forEach(secondary => { secondary.getDBNames().forEach(dbName => combinedDBs.add(dbName)); @@ -1915,8 +1916,10 @@ var ReplSetTest = function(opts) { // Check that the following collection stats are the same across replica set // members: // capped - // nindexes + // nindexes, except on nodes with buildIndexes: false // ns + const hasSecondaryIndexes = + replSetConfig.members[rst.getNodeId(secondary)].buildIndexes !== false; primaryCollections.forEach(collName => { var primaryCollStats = primary.getDB(dbName).runCommand({collStats: collName}); @@ -1928,7 +1931,8 @@ var ReplSetTest = function(opts) { printCollectionInfo('secondary', secondary, dbName, collName); success = false; } else if (primaryCollStats.capped !== secondaryCollStats.capped || - primaryCollStats.nindexes !== secondaryCollStats.nindexes || + (hasSecondaryIndexes && + primaryCollStats.nindexes !== secondaryCollStats.nindexes) || primaryCollStats.ns !== secondaryCollStats.ns) { print(msgPrefix + ', the primary and secondary have different stats for the ' + |